|
@@ -2309,76 +2309,32 @@ end;
 
 
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
-function UTF8CodePointLength(firstbyte: byte): SizeInt;
-var
-  firstzerobit: SizeInt;
-begin
-  result:=1;
-  { bsr searches for the leftmost 1 bit. We are interested in the
-    leftmost 0 bit, so first invert the value
-  }
-  firstzerobit:=BsrByte(not(firstbyte));
-  { if there is no zero bit or the first zero bit is the rightmost bit
-    (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
-    UTF-8-encoded string, and in the worst case bit 1 has to be zero)
-  }
-  if (firstzerobit=0) or (firstzerobit=255) then
-    exit;
-  { the number of bytes belonging to this code point is
-    7-(pos first 0-bit).
-  }
-  result:=7-firstzerobit;
-end;
-
-
+{ Scans the buffered bytes of t backwards, starting at index t.bufpos-1, and
+  returns the index just past the last position at which Utf8CodePointLen
+  reports a complete code point (a positive length), i.e. how many bytes of
+  the buffer can be handled now. If no position yields a positive length
+  (or the buffer is empty), t.bufpos is returned. }
 function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
 var
-  i, lenfound, codepointlen: SizeInt;
-  b: byte;
+  i, codepointlen: sizeint;
 begin
-  lenfound:=0;
   for i:=t.bufpos-1 downto 0 do
     begin
-      b:=byte(t.bufptr^[i]);
-      if b<=127 then
+      { we don't care about combining diacritical marks here: we just want a
+        valid UTF-8 codepoint that we can translate to UTF-16. The combining
+        diacritical marks can be translated separately.
+        t.bufpos-i is the number of buffered bytes available from index i on }
+      codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),t.bufpos-i,false);
+      { complete codepoint -> flush till here }
+      if codepointlen>0 then
         begin
-          if lenfound = 0 then
-            { valid simple code point }
-            result:=i+1
-          else
-            { valid simple code point followed by a bunch of invalid data ->
-              handle everything since it can't become valid by adding more
-              bytes }
-            result:=t.bufpos;
+          result:=i+codepointlen;
           exit;
-        end;
-      { start of a complex character }
-      if (b and %11000000)<>0 then
-        begin
-          codepointlen:=UTF8CodePointLength(b);
-          { we did not yet get all bytes of the last code point -> handle
-            everything until the start of this character }
-          if codepointlen>lenfound+1 then
-            if i<>0 then
-              result:=i
-            { the buffer is too small to contain the entire utf-8 code point
-              -> nothing else to do but handle the entire buffer (and end up
-              with an invalid character) -- since writestr uses the default
-              buffer size of 32 bytes, this can only happen for invalid utf-8
-              encodings }
-            else
-              result:=t.bufpos
-          { the last code point is invalid -> handle everything since it can't
-            become valid by adding more bytes; in case it's complete, we also
-            handle everything, of course}
-          else
-            result:=t.bufpos;
-          exit;
-        end;
-      inc(lenfound);
+        end
     end;
   { all invalid data, or the buffer is too small to be able to deal with the
-    complete utf8char -> nothing else to do but to handle the entire buffer }
+    complete utf8char -> nothing else to do but to handle the entire buffer
+    (and end up with a partial/invalid character) }
   result:=t.bufpos;
 end;
|
|
|
|