10 vuotta sitten · ff020a3be4
--- a/.gitattributes
+++ b/.gitattributes
@@ -12392,6 +12392,7 @@ tests/test/tunit3.pp svneol=native#text/plain
 
															 tests/test/tunroll1.pp svneol=native#text/plain
														
 
															 tests/test/tutf81.pp svneol=native#text/plain
														
 
															 tests/test/tutf82.pp svneol=native#text/plain
														
 
															+tests/test/tutf8cpl.pp svneol=native#text/plain
														
 
															 tests/test/tvarpropsetter1.pp svneol=native#text/plain
														
 
															 tests/test/tvarpropsetter2.pp svneol=native#text/plain
														
 
															 tests/test/tvarset1.pp svneol=native#text/plain
														
--- a/rtl/inc/generic.inc
+++ b/rtl/inc/generic.inc
@@ -1076,6 +1076,144 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 
															 {$endif not cpujvm}
														
 
															+
														
 
															+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
														
 
															+  var
														
 
															+    bytes: sizeint;
														
 
															+    firstzerobit: byte;
														
 
															+  begin
														
 
															+    { see https://en.wikipedia.org/wiki/UTF-8#Description for details }
														
 
															+
														
 
															+    if maxlookahead<=0 then
														
 
															+      begin
														
 
															+        { incomplete }
														
 
															+        result:=0;
														
 
															+        exit;
														
 
															+      end;
														
 
															+    { inlcude the first byte }
														
 
															+    result:=1;
														
 
															+    { multiple byte utf-8 code point? }
														
 
															+    if p[0]>#127 then
														
 
															+      begin
														
 
															+        { bsr searches for the leftmost 1 bit. We are interested in the
														
 
															+          leftmost 0 bit, so first invert the value
														
 
															+        }
														
 
															+        firstzerobit:=bsrbyte(not(byte(p[0])));
														
 
															+        { if there is no zero bit or the first zero bit is the rightmost bit
														
 
															+          (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
														
 
															+          UTF-8-encoded string, and in the worst case bit 1 has to be zero)
														
 
															+          Additionally, 5-byte UTF-8 sequences don't exist either, so bit 1
														
 
															+          cannot be the first zero-bit either. And bits 6 and 7 can't be 0
														
 
															+          either in the first byte.
														
 
															+        }
														
 
															+        if (firstzerobit<=1) or (firstzerobit>=6)  then
														
 
															+          begin
														
 
															+            result:=-result;
														
 
															+            exit;
														
 
															+          end;
														
 
															+        { the number of bytes belonging to this code point is
														
 
															+          7-(pos first 0-bit). Subtract 1 since we're already at the first
														
 
															+          byte. All subsequent bytes of the same sequence must have their
														
 
															+          highest bit set and the next one unset. We stop when we detect an
														
 
															+          invalid sequence.
														
 
															+        }
														
 
															+        bytes:=6-firstzerobit;
														
 
															+        while (result<maxlookahead) and
														
 
															+              (bytes>0) and
														
 
															+              ((ord(p[result]) and %11000000)=%10000000) do
														
 
															+          begin
														
 
															+            inc(result);
														
 
															+            dec(bytes);
														
 
															+          end;
														
 
															+        { stopped because of invalid/incomplete sequence -> exit }
														
 
															+        if bytes<>0 then
														
 
															+          begin
														
 
															+            if result>=maxlookahead then
														
 
															+              result:=0
														
 
															+            else
														
 
															+              result:=-result;
														
 
															+            exit;
														
 
															+          end;
														
 
															+      end;
														
 
															+    if includecombiningdiacriticalmarks then
														
 
															+      begin
														
 
															+        { combining diacritical marks?
														
 
															+            1) U+0300 - U+036F in UTF-8 = %11001100 10000000 - %11001101 10101111
														
 
															+            2) U+1AB0 - U+1AFF in UTF-8 = %11100001 10101010 10110000 - %11100001 10101011 10111111
														
 
															+            3) U+1DC0 - U+1DFF in UTF-8 = %11100001 10110111 10000000 - %11100001 10110111 10111111
														
 
															+            4) U+20D0 - U+20FF in UTF-8 = %11100010 10000011 10010000 - %11100010 10000011 10111111
														
 
															+            5) U+FE20 - U+FE2F in UTF-8 = %11101111 10111000 10100000 - %11101111 10111000 10101111
														
 
															+        }
														
 
															+        repeat
														
 
															+          bytes:=result;
														
 
															+          if result+1<maxlookahead then
														
 
															+            begin
														
 
															+              { case 1) }
														
 
															+              if ((ord(p[result]) and %11001100=%11001100)) and
														
 
															+                  (ord(p[result+1])>=%10000000) and
														
 
															+                  (ord(p[result+1])<=%10101111) then
														
 
															+                inc(result,2)
														
 
															+                  { case 2), 3), 4), 5) }
														
 
															+              else if (result+2<maxlookahead) and
														
 
															+                 (ord(p[result])>=%11100001) then
														
 
															+                begin
														
 
															+                     { case 2) }
														
 
															+                  if ((ord(p[result])=%11100001) and
														
 
															+                      (ord(p[result+1])=%10101010) and
														
 
															+                      (ord(p[result+2])>=%10110000) and
														
 
															+                      (ord(p[result+2])<=%10111111)) or
														
 
															+                     { case 3) }
														
 
															+                     ((ord(p[result])=%11100001) and
														
 
															+                      (ord(p[result+1])=%10110111) and
														
 
															+                      (ord(p[result+2])>=%10000000) and
														
 
															+                      (ord(p[result+2])<=%10111111)) or
														
 
															+                     { case 4) }
														
 
															+                     ((ord(p[result])=%11100010) and
														
 
															+                      (ord(p[result+1])=%10000011) and
														
 
															+                      (ord(p[result+2])>=%10010000) and
														
 
															+                      (ord(p[result+2])<=%10111111)) or
														
 
															+                     { case 5) }
														
 
															+                     ((ord(p[result])=%11101111) and
														
 
															+                      (ord(p[result+1])=%10111000) and
														
 
															+                      (ord(p[result+2])>=%10100000) and
														
 
															+                      (ord(p[result+2])<=%10101111)) then
														
 
															+                    inc(result,3);
														
 
															+                end;
														
 
															+            end;
														
 
															+        until bytes=result;
														
 
															+        { is there an incomplete diacritical mark? (invalid makes little sense:
														
 
															+          either a sequence is a combining diacritical mark, or it's not ; if
														
 
															+          it's invalid, it may also not have been a combining diacritical mark)
														
 
															+        }
														
 
															+        if result<maxlookahead then
														
 
															+          begin
														
 
															+               { case 1) }
														
 
															+            if (((ord(p[result]) and %11001100=%11001100)) and
														
 
															+                (result+1>=maxlookahead)) or
														
 
															+               { case 2) and 3)}
														
 
															+               ((ord(p[result])=%11100001) and
														
 
															+                ((result+1>=maxlookahead) or
														
 
															+                 (((ord(p[result+1])=%10101010) or
														
 
															+                   (ord(p[result+1])=%10110111)) and
														
 
															+                  (result+2>=maxlookahead)))) or
														
 
															+               { case 4 }
														
 
															+               ((ord(p[result])=%11100010) and
														
 
															+                ((result+1>=maxlookahead) or
														
 
															+                 ((ord(p[result+1])=%10000011) and
														
 
															+                  (result+2>=maxlookahead)))) or
														
 
															+               { case 5 }
														
 
															+               ((ord(p[result])=%11101111) and
														
 
															+                ((result+1>=maxlookahead) or
														
 
															+                 ((ord(p[result+1])=%10111000) and
														
 
															+                  (result+2>=maxlookahead)))) then
														
 
															+              begin
														
 
															+                result:=0;
														
 
															+                exit;
														
 
															+              end;
														
 
															+          end;
														
 
															+      end;
														
 
															+  end;
														
 
															+
														
 
															 {$ifndef FPC_SYSTEM_HAS_FPC_CHARARRAY_TO_SHORTSTR}
														
 
															 procedure fpc_chararray_to_shortstr(out res : shortstring;const arr: array of char; zerobased: boolean = true);[public,alias:'FPC_CHARARRAY_TO_SHORTSTR']; compilerproc;
														
--- a/rtl/inc/systemh.inc
+++ b/rtl/inc/systemh.inc
@@ -1081,6 +1081,14 @@ Function  Sseg:Word;{$ifdef SYSTEMINLINE}inline;{$endif}
 
															 function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
														
 
															 function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
														
 
															+{ result:
														
 
															+  <0: invalid sequence detected after processing "-result" bytes
														
 
															+  0: incomplete (may still be valid if MaxLookAhead is increased)
														
 
															+  >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
														
 
															+      parameter was true)
														
 
															+}
														
 
															+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
														
 
															+
														
 
															 { Shortstring functions }
														
 
															 Procedure Delete(var s:shortstring;index:SizeInt;count:SizeInt);
														
 
															 Procedure Insert(const source:shortstring;var s:shortstring;index:SizeInt);
														
--- a/rtl/java/jsystemh.inc
+++ b/rtl/java/jsystemh.inc
@@ -442,6 +442,15 @@ function strpas(p:pchar):shortstring;{$ifdef SYSTEMINLINE}inline;{$endif}
 
															 function strlen(p:pchar):sizeint;external name 'FPC_PCHAR_LENGTH';
														
 
															 *)
														
 
															+{ result:
														
 
															+  <0: invalid sequence detected after processing "-result" bytes
														
 
															+  0: incomplete (may still be valid if MaxLookAhead is increased)
														
 
															+  >0: sequence of result bytes forms a codepoint (+ combining diacritics if that
														
 
															+      parameter was true)
														
 
															+}
														
 
															+function Utf8CodePointLen(P: PAnsiChar; MaxLookAhead: SizeInt; IncludeCombiningDiacriticalMarks: Boolean): SizeInt;
														
 
															+
														
 
															+
														
 
															 var
														
 
															   { separated compared to generic version, for Java type safety }
														
 
															   FPC_EMPTYANSICHAR : array[0..0] of ansichar;
														
--- a/tests/test/tutf8cpl.pp
+++ b/tests/test/tutf8cpl.pp
@@ -0,0 +1,148 @@
 
															+{$mode objfpc}
														
 
															+{$codepage utf8}
														
 
															+
														
 
															+var
														
 
															+  name: utf8string;
														
 
															+  
														
 
															+procedure check(index, lookahead: longint; combiningdiacritics: boolean; expectedresult: longint; checknr: longint);
														
 
															+begin
														
 
															+  if Utf8CodePointLen(pchar(@name[index]),lookahead,combiningdiacritics)<>expectedresult then
														
 
															+    begin
														
 
															+      writeln('check ',checknr,': Utf8CodePointLen(',copy(name,index,length(name)),',',lookahead,',',combiningdiacritics,') = ',Utf8CodePointLen(pchar(@name[index]),lookahead,false),' <> expected ',expectedresult);
														
 
															+      halt(1);
														
 
															+    end;
														
 
															+end;
														
 
															+
														
 
															+begin
														
 
															+  name:='a';
														
 
															+  check(1,0,false,0,1);
														
 
															+  check(1,1,false,1,2);
														
 
															+  check(1,1,true,1,3);
														
 
															+  check(1,6,false,1,4);
														
 
															+  check(1,6,true,1,5);
														
 
															+  name:='ab';
														
 
															+  check(1,6,false,1,6);
														
 
															+  check(1,6,true,1,7);
														
 
															+  check(1,1,false,1,8);
														
 
															+  check(1,1,true,1,9);
														
 
															+  check(2,6,false,1,10);
														
 
															+  check(2,6,true,1,11);
														
 
															+  name:='é';
														
 
															+  check(1,1,false,0,12);
														
 
															+  check(1,1,true,0,13);
														
 
															+  check(1,2,false,2,14);
														
 
															+  check(1,2,true,2,15);
														
 
															+  check(2,1,false,-1,16);
														
 
															+  check(2,1,true,-1,17);
														
 
															+  check(2,3,false,-1,18);
														
 
															+  check(2,3,true,-1,19);
														
 
															+  name:='éa';
														
 
															+  check(1,1,false,0,20);
														
 
															+  check(1,1,true,0,21);
														
 
															+  check(1,2,false,2,22);
														
 
															+  check(1,2,true,2,23);
														
 
															+  check(2,1,false,-1,24);
														
 
															+  check(2,1,true,-1,25);
														
 
															+  check(2,3,false,-1,26);
														
 
															+  check(2,3,true,-1,27);
														
 
															+  check(3,1,false,1,28);
														
 
															+  check(3,1,true,1,29);
														
 
															+  check(3,4,false,1,30);
														
 
															+  check(3,4,true,1,31);
														
 
															+  name[3]:=name[2];
														
 
															+  check(1,1,false,0,32);
														
 
															+  check(1,1,true,0,33);
														
 
															+  check(1,2,false,2,34);
														
 
															+  check(1,2,true,2,35);
														
 
															+  check(2,1,false,-1,36);
														
 
															+  check(2,1,true,-1,37);
														
 
															+  check(2,3,false,-1,38);
														
 
															+  check(2,3,true,-1,39);
														
 
															+  check(3,1,false,-1,40);
														
 
															+  check(3,1,true,-1,41);
														
 
															+  check(3,4,false,-1,42);
														
 
															+  check(3,4,true,-1,43);
														
 
															+  { e + combining ` }
														
 
															+  name:='e'#$0300'b';
														
 
															+  { check just the e without accent }
														
 
															+  check(1,1,false,1,44);
														
 
															+  check(1,1,true,1,45);
														
 
															+  check(1,2,false,1,46);
														
 
															+  { partial diacritical mark }
														
 
															+  check(1,2,true,0,47);
														
 
															+  check(1,3,false,1,48);
														
 
															+  { complete diacritical mark }
														
 
															+  check(1,3,true,3,49);
														
 
															+  check(1,4,false,1,50);
														
 
															+  { complete diacritical mark (ignore extra character) }
														
 
															+  check(1,4,true,3,51);
														
 
															+  { start of combining diacritical mark -- treated as independent utf-8 codepoint }
														
 
															+  check(2,1,false,0,52);
														
 
															+  check(2,1,true,0,53);
														
 
															+  check(2,3,false,2,54);
														
 
															+  check(2,3,true,2,55);
														
 
															+  { middle of the combining diacritical mark }
														
 
															+  check(3,1,false,-1,56);
														
 
															+  check(3,1,true,-1,57);
														
 
															+  check(3,4,false,-1,58);
														
 
															+  check(3,4,true,-1,59);
														
 
															+  { corrupt diacritical mark = no diacritical mark }
														
 
															+  name[3]:=name[4];
														
 
															+  { partial diacritical mark (the corrupted byte is not included in the
														
 
															+    lookahead) }
														
 
															+  check(1,2,true,0,60);
														
 
															+  check(1,3,false,1,61);
														
 
															+  { ignore corrupt diacritical mark }
														
 
															+  check(1,3,true,1,62);
														
 
															+  check(1,4,false,1,63);
														
 
															+  check(1,4,true,1,64);
														
 
															+  { e + combining circle + combining superscript 'n' }
														
 
															+  name:='e'#$20DD#$1DE0'b';
														
 
															+  { partial diacritical mark }
														
 
															+  check(1,2,true,0,65);
														
 
															+  check(1,3,false,1,66);
														
 
															+  check(1,3,true,0,67);
														
 
															+  check(1,4,false,1,68);
														
 
															+  { complete diacritical mark }
														
 
															+  check(1,4,true,4,69);
														
 
															+  check(1,4,false,1,70);
														
 
															+  { partial second diacritical mark }
														
 
															+  check(1,5,true,0,71);
														
 
															+  check(1,5,false,1,72);
														
 
															+  check(1,6,true,0,73);
														
 
															+  check(1,6,false,1,74);
														
 
															+  { complete both diacritical marks }
														
 
															+  check(1,7,true,7,75);
														
 
															+  check(1,7,false,1,76);
														
 
															+  check(1,10,true,7,77);
														
 
															+  check(1,10,false,1,78);
														
 
															+  { complete both diacritical marks without first character }
														
 
															+  check(2,6,true,6,79);
														
 
															+  check(2,20,true,6,80);
														
 
															+  { only the first one, treated as independent codepoint }
														
 
															+  check(2,7,false,3,81);
														
 
															+  { corrupt second diacritical mark }
														
 
															+  name[7]:=name[8];
														
 
															+  { partial second diacritical mark }
														
 
															+  check(1,5,true,0,82);
														
 
															+  check(1,5,false,1,83);
														
 
															+  check(1,6,true,0,84);
														
 
															+  check(1,6,false,1,85);
														
 
															+  { including bad byte -> ignore second diacritical mark completely
														
 
															+    (can't know it's part of a diacritical mark or something else) }
														
 
															+  check(1,7,true,4,86);
														
 
															+  check(1,7,false,1,87);
														
 
															+  { complete both diacritical marks without first character,
														
 
															+    but with corrupted byte }
														
 
															+  check(2,7,true,3,88);
														
 
															+  check(2,20,true,3,89);
														
 
															+  { corrupted diacritical mark by itself }
														
 
															+  { 1) incomplete }
														
 
															+  check(5,1,false,0,90);
														
 
															+  check(5,1,true,0,91);
														
 
															+  check(5,2,false,0,92);
														
 
															+  check(5,2,true,0,93);
														
 
															+  { 2) invalid }
														
 
															+  check(5,3,false,-2,94);
														
 
															+  check(5,3,true,-2,95);
														
 
															+end.