10 years ago · 60dbce940a
--- a/rtl/inc/text.inc
+++ b/rtl/inc/text.inc
@@ -2309,76 +2309,28 @@ end;
 
				 
			
 
				 
			
 
				 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
			
 
				-function UTF8CodePointLength(firstbyte: byte): SizeInt;
			
 
				-var
			
 
				-  firstzerobit: SizeInt;
			
 
				-begin
			
 
				-  result:=1;
			
 
				-  { bsr searches for the leftmost 1 bit. We are interested in the
			
 
				-    leftmost 0 bit, so first invert the value
			
 
				-  }
			
 
				-  firstzerobit:=BsrByte(not(firstbyte));
			
 
				-  { if there is no zero bit or the first zero bit is the rightmost bit
			
 
				-    (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
			
 
				-    UTF-8-encoded string, and in the worst case bit 1 has to be zero)
			
 
				-  }
			
 
				-  if (firstzerobit=0) or (firstzerobit=255)  then
			
 
				-    exit;
			
 
				-  { the number of bytes belonging to this code point is
			
 
				-    7-(pos first 0-bit).
			
 
				-  }
			
 
				-  result:=7-firstzerobit;
			
 
				-end;
			
 
				-
			
 
				-
			
 
				 function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
			
 
				 var
			
 
				-  i, lenfound, codepointlen: SizeInt;
			
 
				+  i, lenfound, codepointlen: sizeint;
			
 
				   b: byte;
			
 
				 begin
			
 
				   lenfound:=0;
			
 
				   for i:=t.bufpos-1 downto 0 do
			
 
				     begin
			
 
				-      b:=byte(t.bufptr^[i]);
			
 
				-      if b<=127 then
			
 
				+      { we don't care about combining diacritical marks here: we just want a
			
 
				+        valid UTF-8 codepoint that we can translate to UTF-16. The combining
			
 
				+        diacritical marks can be translated separately }
			
 
				+      codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),(t.bufpos-1-i)+1,false);
			
 
				+      { complete codepoint -> flush till here }
			
 
				+      if codepointlen>0 then
			
 
				         begin
			
 
				-          if lenfound = 0 then
			
 
				-            { valid simple code point }
			
 
				-            result:=i+1
			
 
				-          else
			
 
				-            { valid simple code point followed by a bunch of invalid data ->
			
 
				-              handle everything since it can't become valid by adding more
			
 
				-              bytes }
			
 
				-            result:=t.bufpos;
			
 
				+          result:=i+codepointlen;
			
 
				           exit;
			
 
				-        end;
			
 
				-      { start of a complex character }
			
 
				-      if (b and %11000000)<>0 then
			
 
				-        begin
			
 
				-          codepointlen:=UTF8CodePointLength(b);
			
 
				-          { we did not yet get all bytes of the last code point -> handle
			
 
				-            everything until the start of this character }
			
 
				-          if codepointlen>lenfound+1 then
			
 
				-            if i<>0 then
			
 
				-              result:=i
			
 
				-            { the buffer is too small to contain the entire utf-8 code point
			
 
				-              -> nothing else to do but handle the entire buffer (and end up
			
 
				-              with an invalid character) -- since writestr uses the default
			
 
				-              buffer size of 32 bytes, this can only happen for invalid utf-8
			
 
				-              encodings }
			
 
				-            else
			
 
				-              result:=t.bufpos
			
 
				-          { the last code point is invalid -> handle everything since it can't
			
 
				-            become valid by adding more bytes; in case it's complete, we also
			
 
				-            handle everything, of course}
			
 
				-          else
			
 
				-            result:=t.bufpos;
			
 
				-          exit;
			
 
				-        end;
			
 
				-      inc(lenfound);
			
 
				+        end
			
 
				     end;
			
 
				   { all invalid data, or the buffer is too small to be able to deal with the
			
 
				-    complete utf8char -> nothing else to do but to handle the entire buffer }
			
 
				+    complete utf8char -> nothing else to do but to handle the entire buffer
			
 
				+    (and end up with a partial/invalid character) }
			
 
				   result:=t.bufpos;
			
 
				 end;