{ Most of this code is based on similar functions from Lazarus LCLProc. } unit DCUnicodeUtils; {$mode objfpc}{$H+} interface uses Classes, SysUtils; {en Retrieves length in bytes of the next UTF-8 character. @param(P Pointer to the UTF-8 characters.) @param(aMaxBytes States how many bytes from P can be read.) @param(InvalidCharLen If an invalid UTF-8 character was found then InvalidCharLen has the number of bytes this character spans. If the character was valid InvalidCharLen is zero.) } function SafeUTF8NextCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer; {en Retrieves length in bytes of the previous UTF-8 character. It does not read from P, but rather from memory locations before P. @param(P Pointer to the UTF-8 characters.) @param(aMaxBytes States how many bytes from P *backwards* can be read. So, to safely read 3 bytes backwards ([p-1], [p-2], [p-3]) this parameter should be at least 3.) @param(InvalidCharLen If an invalid UTF-8 character was found then InvalidCharLen has the number of bytes this character spans. If the character was valid InvalidCharLen is zero.) } function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer; function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte; function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte; {en Returns UTF-16 character length, which is either 1 or 2. @param(utf16char Any UTF-16 char or one of the surrogate pairs.) } function UTF16CharLen(utf16char: Word): Integer; {en Converts an UTF-16 surrogate pair into a unicode character. } function utf16PairToUnicode(u1, u2: Word): Cardinal; function Utf16LEToUtf8(const s: string): string; // UTF16-LE 2 or 4 byte little endian function Utf16BEToUtf8(const s: string): string; // UTF16-BE 2 or 4 byte big endian function Utf32LEToUtf8(const s: string): string; // UTF32-LE 4 byte little endian function Utf32BEToUtf8(const s: string): string; // UTF32-BE 4 byte big endian function Utf8ToUtf16LE(const s: string): string; // UTF16-LE 2 or 4 byte little endian function Utf8ToUtf16BE(const s: string): string; // UTF16-BE 2 or 4 byte big endian function UTF8ToUCS4(const UTF8Text: String): UCS4String; {en Replaces invalid UTF-8 characters with '?'. } function Utf8ReplaceBroken(const s: String): String; {en Replaces invalid UTF-8 characters with 0x1A (SUBSTITUTE). } procedure Utf8FixBroken(var S: String); procedure Utf16SwapEndian(var S: UnicodeString); implementation uses LazUTF8; const maxUTF8Len = 7; // really is 4, but this includes any invalid characters up to length 7 function SafeUTF8NextCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer; var BytesLen: Integer; i: Integer; begin if (p=nil) or (aMaxBytes = 0) then begin InvalidCharLen := 0; Result := 0; end else if p^<%10000000 then begin // regular single byte character InvalidCharLen := 0; Result := 1; end else if p^<%11000000 then begin // invalid single byte character InvalidCharLen := 1; Result := 1; end else begin // Read length of UTF-8 character in bytes. if ((p^ and %11100000) = %11000000) then BytesLen := 2 else if ((p^ and %11110000) = %11100000) then BytesLen := 3 else if ((p^ and %11111000) = %11110000) then BytesLen := 4 else if ((p^ and %11111100) = %11111000) then BytesLen := 5 else if ((p^ and %11111110) = %11111100) then BytesLen := 6 else if ((p^ and %11111111) = %11111110) then BytesLen := 7 else begin InvalidCharLen := 1; exit(1); end; // Check if the next bytes are from the middle of a character. for i := 1 to BytesLen - 1 do begin if (aMaxBytes < i) or ((p[i] and %11000000) <> %10000000) then begin InvalidCharLen := i; exit(1); end; end; InvalidCharLen := 0; Result := BytesLen; end; end; function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer; var BytesLen: Integer; signature: Byte; begin if (p=nil) or (aMaxBytes = 0) then begin InvalidCharLen := 0; Result := 0; end else if p[-1]<%10000000 then begin // regular single byte character InvalidCharLen := 0; Result := 1; end else begin for BytesLen := 1 to maxUTF8Len do begin if (aMaxBytes < BytesLen) then begin InvalidCharLen := aMaxBytes; exit(1); end; // Move past all the bytes in the middle of a character. if (p[-BytesLen] and %11000000) <> %10000000 then break; if BytesLen = maxUTF8Len then begin InvalidCharLen := BytesLen; exit(1); end; end; if p[-BytesLen]<%11000000 then begin // invalid first byte of a character InvalidCharLen := BytesLen; Result := 1; end else begin signature := Byte($FF shl (7 - BytesLen)); if (p[-BytesLen] and signature) = Byte(signature shl 1) then begin // Correct first byte of a character. InvalidCharLen := 0; Result := BytesLen; end else begin // Invalid first byte of a character, or p is in the middle of a character. InvalidCharLen := BytesLen; Result := 1; end; end; end; end; function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte; var CharLen: LongInt; InvalidCharLen: Integer; begin Result:=UTF8Str; if Result<>nil then begin while (Len>0) do begin CharLen := SafeUTF8NextCharLen(Result, Len, InvalidCharLen); if InvalidCharLen > 0 then begin dec(Len,InvalidCharLen); inc(Result,InvalidCharLen); end else if CharLen = 0 then exit(nil) else exit(Result); end; Result:=nil; end; end; function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte; var CharLen: LongInt; InvalidCharLen: Integer; begin Result:=UTF8Str; if Result<>nil then begin while (Len>0) do begin CharLen := SafeUTF8PrevCharLen(Result, Len, InvalidCharLen); if InvalidCharLen > 0 then begin dec(Len,InvalidCharLen); dec(Result,InvalidCharLen); end else if CharLen = 0 then exit(nil) else exit(Result); // Result is the character beginning end; Result:=nil; end; end; function UTF16CharLen(utf16char: Word): Integer; inline; begin if (utf16char < $D800) or (utf16char > $DFFF) then Result := 1 else Result := 2; end; function utf16PairToUnicode(u1, u2: Word): Cardinal; begin if (u1 >= $D800) and (u1 <= $DBFF) then begin if (u2 >= $DC00) and (u2 <= $DFFF) then Result := (Cardinal(u1 - $D800) shl 10) + Cardinal(u2 - $DC00) + $10000 else Result := 0; end else Result := u1; end; function Utf16LEToUtf8(const s: string): string; var len: Integer; Src, Limit: PWord; Dest: PAnsiChar; u: Cardinal; begin if Length(s) < 2 then begin Result:=''; exit; end; Src:=PWord(Pointer(s)); Limit := PWord(Pointer(Src) + Length(s)); SetLength(Result, length(s) * 2); Dest:=PAnsiChar(Result); while Src + 1 <= Limit do begin len := UTF16CharLen(Src^); if len = 1 then u := LEtoN(Src^) else begin if Src + 2 <= Limit then u := utf16PairToUnicode(LEtoN(Src[0]), LEtoN(Src[1])) else break; end; inc(Src, len); if u<128 then begin Dest^:=chr(u); inc(Dest); end else begin inc(Dest,UnicodeToUTF8SkipErrors(u,Dest)); end; end; len:=PtrUInt(Dest)-PtrUInt(Result); Assert(len <= length(Result)); SetLength(Result,len); end; function Utf16BEToUtf8(const s: string): string; var len: Integer; Src, Limit: PWord; Dest: PAnsiChar; u: Cardinal; begin if Length(s) < 2 then begin Result:=''; exit; end; Src:=PWord(Pointer(s)); Limit := PWord(Pointer(Src) + Length(s)); SetLength(Result, length(s) * 2); Dest:=PAnsiChar(Result); while Src + 1 <= Limit do begin len := UTF16CharLen(swap(Src^)); if len = 1 then u := BEtoN(Src^) else begin if Src + 2 <= Limit then u := utf16PairToUnicode(BEtoN(Src[0]), BEtoN(Src[1])) else break; end; inc(Src, len); if u<128 then begin Dest^:=chr(u); inc(Dest); end else begin inc(Dest,UnicodeToUTF8SkipErrors(u,Dest)); end; end; len:=PtrUInt(Dest)-PtrUInt(Result); Assert(len <= length(Result)); SetLength(Result,len); end; function Utf32LEToUtf8(const s: string): string; var len: Integer; Src: PLongWord; Dest: PAnsiChar; i: Integer; c: LongWord; begin if Length(s) < 4 then begin Result:=''; exit; end; len:=length(s) div 4; SetLength(Result,len*4); Src:=PLongWord(Pointer(s)); Dest:=PAnsiChar(Result); for i:=1 to len do begin c:=LEtoN(Src^); inc(Src); if c<128 then begin Dest^:=chr(c); inc(Dest); end else begin inc(Dest,UnicodeToUTF8SkipErrors(c,Dest)); end; end; len:=PtrUInt(Dest)-PtrUInt(Result); Assert(len <= length(Result)); SetLength(Result,len); end; function Utf32BEToUtf8(const s: string): string; var len: Integer; Src: PLongWord; Dest: PAnsiChar; i: Integer; c: LongWord; begin if Length(s) < 4 then begin Result:=''; exit; end; len:=length(s) div 4; SetLength(Result,len*4); Src:=PLongWord(Pointer(s)); Dest:=PAnsiChar(Result); for i:=1 to len do begin c:=BEtoN(Src^); inc(Src); if c<128 then begin Dest^:=chr(c); inc(Dest); end else begin inc(Dest,UnicodeToUTF8SkipErrors(c,Dest)); end; end; len:=PtrUInt(Dest)-PtrUInt(Result); Assert(len <= length(Result)); SetLength(Result,len); end; function Utf8ToUtf16LE(const s: string): string; var L: SizeUInt; {$IF DEFINED(ENDIAN_BIG)} P: PWord; I: SizeInt; {$ENDIF} begin if Length(S) = 0 then begin Result := ''; Exit; end; // Wide chars of UTF-16 <= bytes of UTF-8 string SetLength(Result, Length(S) * SizeOf(WideChar)); if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar), PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError then Result := '' else begin SetLength(Result, (L - 1) * SizeOf(WideChar)); // Swap endian if needed {$IF DEFINED(ENDIAN_BIG)} P := PWord(PAnsiChar(Result)); for I := 0 to SizeInt(L) - 1 do begin P[I] := SwapEndian(P[I]); end; {$ENDIF} end; end; function Utf8ToUtf16BE(const s: string): string; var L: SizeUInt; {$IF DEFINED(ENDIAN_LITTLE)} P: PWord; I: SizeInt; {$ENDIF} begin if Length(S) = 0 then begin Result := ''; Exit; end; // Wide chars of UTF-16 <= bytes of UTF-8 string SetLength(Result, Length(S) * SizeOf(WideChar)); if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar), PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError then Result := '' else begin SetLength(Result, (L - 1) * SizeOf(WideChar)); // Swap endian if needed {$IF DEFINED(ENDIAN_LITTLE)} P := PWord(PAnsiChar(Result)); for I := 0 to SizeInt(L) - 1 do begin P[I] := SwapEndian(P[I]); end; {$ENDIF} end; end; function UTF8ToUCS4(const UTF8Text: String): UCS4String; var Len: PtrInt; Index: Integer; CharLen: Integer; SrcPos: PAnsiChar; begin Len:= Length(UTF8Text); SetLength(Result, Len); if Len = 0 then Exit; Index:= 0; SrcPos:= PAnsiChar(UTF8Text); while Len > 0 do begin Result[Index]:= UTF8CodepointToUnicode(SrcPos, CharLen); Inc(SrcPos, CharLen); Dec(Len, CharLen); Inc(Index); end; SetLength(Result, Index); end; function Utf8ReplaceBroken(const s: String): String; var Src, Dst, LastGoodPos: PByte; BytesLeft: Integer; InvalidCharLen: Integer; CharLen: Integer; begin if Length(s) = 0 then Exit(s); BytesLeft := Length(s); SetLength(Result, BytesLeft); // at most the same length Src := PByte(s); Dst := PByte(Result); LastGoodPos := Src; while BytesLeft > 0 do begin CharLen := SafeUTF8NextCharLen(Src, BytesLeft, InvalidCharLen); if InvalidCharLen > 0 then begin if LastGoodPos < Src then begin System.Move(LastGoodPos^, Dst^, Src - LastGoodPos); Inc(Dst, Src - LastGoodPos); end; Inc(Src, InvalidCharLen); Dec(BytesLeft, InvalidCharLen); LastGoodPos := Src; Dst^ := ord('?'); Inc(Dst); end else begin Inc(Src, CharLen); Dec(BytesLeft, CharLen); end; end; if LastGoodPos = PByte(s) then Result := s // All characters are good. else begin if LastGoodPos < Src then begin System.Move(LastGoodPos^, Dst^, Src - LastGoodPos); Inc(Dst, Src - LastGoodPos); end; SetLength(Result, Dst - PByte(Result)); end; end; procedure Utf8FixBroken(var S: String); var P: PAnsiChar; C, L: Integer; begin L:= Length(S); P:= Pointer(S); while (L > 0) do begin if Ord(P^) < %10000000 then begin // Regular single byte character C:= 1; end else if Ord(P^) < %11000000 then begin // Invalid character C:= 1; P^:= #26; end else if ((Ord(P^) and %11100000) = %11000000) then begin // Should be 2 byte character if (L > 1) and ((Ord(P[1]) and %11000000) = %10000000) then C:= 2 else begin // Invalid character C:= 1; P^:= #26; end; end else if ((Ord(P^) and %11110000) = %11100000) then begin // Should be 3 byte character if (L > 2) and ((Ord(P[1]) and %11000000) = %10000000) and ((Ord(P[2]) and %11000000) = %10000000) then C:= 3 else begin // Invalid character C:= 1; P^:= #26; end end else if ((Ord(P^) and %11111000) = %11110000) then begin // Should be 4 byte character if (L > 3) and ((Ord(P[1]) and %11000000) = %10000000) and ((Ord(P[2]) and %11000000) = %10000000) and ((Ord(P[3]) and %11000000) = %10000000) then C:= 4 else begin // Invalid character C:= 1; P^:= #26; end end else begin // Invalid character C:= 1; P^:= #26; end; Dec(L, C); Inc(P, C); end; end; procedure Utf16SwapEndian(var S: UnicodeString); var P: PWord; I, L: Integer; begin L:= Length(S); P:= PWord(PWideChar(S)); for I:= 0 to L - 1 do begin P[I]:= SwapEndian(P[I]); end; end; end.