| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616 |
- {
- Most of this code is based on similar functions from Lazarus LCLProc.
- }
- unit DCUnicodeUtils;
- {$mode objfpc}{$H+}
- interface
- uses
- Classes, SysUtils;
- {en
- Retrieves length in bytes of the next UTF-8 character.
- @param(P
- Pointer to the UTF-8 characters.)
- @param(aMaxBytes
- States how many bytes from P can be read.)
- @param(InvalidCharLen
- If an invalid UTF-8 character was found then InvalidCharLen has
- the number of bytes this character spans. If the character was valid
- InvalidCharLen is zero.)
- }
- function SafeUTF8NextCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
- {en
- Retrieves length in bytes of the previous UTF-8 character.
- It does not read from P, but rather from memory locations before P.
- @param(P
- Pointer to the UTF-8 characters.)
- @param(aMaxBytes
- States how many bytes from P *backwards* can be read.
- So, to safely read 3 bytes backwards ([p-1], [p-2], [p-3])
- this parameter should be at least 3.)
- @param(InvalidCharLen
- If an invalid UTF-8 character was found then InvalidCharLen has
- the number of bytes this character spans. If the character was valid
- InvalidCharLen is zero.)
- }
- function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
- function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte;
- function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte;
- {en
- Returns UTF-16 character length, which is either 1 or 2.
- @param(utf16char
- Any UTF-16 char or one of the surrogate pairs.)
- }
- function UTF16CharLen(utf16char: Word): Integer;
- {en
- Converts an UTF-16 surrogate pair into a unicode character.
- }
- function utf16PairToUnicode(u1, u2: Word): Cardinal;
- function Utf16LEToUtf8(const s: string): string; // UTF16-LE 2 or 4 byte little endian
- function Utf16BEToUtf8(const s: string): string; // UTF16-BE 2 or 4 byte big endian
- function Utf32LEToUtf8(const s: string): string; // UTF32-LE 4 byte little endian
- function Utf32BEToUtf8(const s: string): string; // UTF32-BE 4 byte big endian
- function Utf8ToUtf16LE(const s: string): string; // UTF16-LE 2 or 4 byte little endian
- function Utf8ToUtf16BE(const s: string): string; // UTF16-BE 2 or 4 byte big endian
- function UTF8ToUCS4(const UTF8Text: String): UCS4String;
- {en
- Replaces invalid UTF-8 characters with '?'.
- }
- function Utf8ReplaceBroken(const s: String): String;
- {en
- Replaces invalid UTF-8 characters with 0x1A (SUBSTITUTE).
- }
- procedure Utf8FixBroken(var S: String);
- procedure Utf16SwapEndian(var S: UnicodeString);
- implementation
- uses
- LazUTF8;
- const
- maxUTF8Len = 7; // really is 4, but this includes any invalid characters up to length 7
- function SafeUTF8NextCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
- var
- BytesLen: Integer;
- i: Integer;
- begin
- if (p=nil) or (aMaxBytes = 0) then
- begin
- InvalidCharLen := 0;
- Result := 0;
- end
- else if p^<%10000000 then begin
- // regular single byte character
- InvalidCharLen := 0;
- Result := 1;
- end
- else if p^<%11000000 then begin
- // invalid single byte character
- InvalidCharLen := 1;
- Result := 1;
- end
- else
- begin
- // Read length of UTF-8 character in bytes.
- if ((p^ and %11100000) = %11000000) then BytesLen := 2
- else if ((p^ and %11110000) = %11100000) then BytesLen := 3
- else if ((p^ and %11111000) = %11110000) then BytesLen := 4
- else if ((p^ and %11111100) = %11111000) then BytesLen := 5
- else if ((p^ and %11111110) = %11111100) then BytesLen := 6
- else if ((p^ and %11111111) = %11111110) then BytesLen := 7
- else
- begin
- InvalidCharLen := 1;
- exit(1);
- end;
- // Check if the next bytes are from the middle of a character.
- for i := 1 to BytesLen - 1 do
- begin
- if (aMaxBytes < i) or ((p[i] and %11000000) <> %10000000) then
- begin
- InvalidCharLen := i;
- exit(1);
- end;
- end;
- InvalidCharLen := 0;
- Result := BytesLen;
- end;
- end;
- function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
- var
- BytesLen: Integer;
- signature: Byte;
- begin
- if (p=nil) or (aMaxBytes = 0) then
- begin
- InvalidCharLen := 0;
- Result := 0;
- end
- else if p[-1]<%10000000 then begin
- // regular single byte character
- InvalidCharLen := 0;
- Result := 1;
- end
- else
- begin
- for BytesLen := 1 to maxUTF8Len do
- begin
- if (aMaxBytes < BytesLen) then
- begin
- InvalidCharLen := aMaxBytes;
- exit(1);
- end;
- // Move past all the bytes in the middle of a character.
- if (p[-BytesLen] and %11000000) <> %10000000 then
- break;
- if BytesLen = maxUTF8Len then
- begin
- InvalidCharLen := BytesLen;
- exit(1);
- end;
- end;
- if p[-BytesLen]<%11000000 then
- begin
- // invalid first byte of a character
- InvalidCharLen := BytesLen;
- Result := 1;
- end
- else
- begin
- signature := Byte($FF shl (7 - BytesLen));
- if (p[-BytesLen] and signature) = Byte(signature shl 1) then
- begin
- // Correct first byte of a character.
- InvalidCharLen := 0;
- Result := BytesLen;
- end
- else
- begin
- // Invalid first byte of a character, or p is in the middle of a character.
- InvalidCharLen := BytesLen;
- Result := 1;
- end;
- end;
- end;
- end;
- function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte;
- var
- CharLen: LongInt;
- InvalidCharLen: Integer;
- begin
- Result:=UTF8Str;
- if Result<>nil then begin
- while (Len>0) do begin
- CharLen := SafeUTF8NextCharLen(Result, Len, InvalidCharLen);
- if InvalidCharLen > 0 then
- begin
- dec(Len,InvalidCharLen);
- inc(Result,InvalidCharLen);
- end
- else if CharLen = 0 then
- exit(nil)
- else
- exit(Result);
- end;
- Result:=nil;
- end;
- end;
- function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte;
- var
- CharLen: LongInt;
- InvalidCharLen: Integer;
- begin
- Result:=UTF8Str;
- if Result<>nil then begin
- while (Len>0) do begin
- CharLen := SafeUTF8PrevCharLen(Result, Len, InvalidCharLen);
- if InvalidCharLen > 0 then
- begin
- dec(Len,InvalidCharLen);
- dec(Result,InvalidCharLen);
- end
- else if CharLen = 0 then
- exit(nil)
- else
- exit(Result); // Result is the character beginning
- end;
- Result:=nil;
- end;
- end;
- function UTF16CharLen(utf16char: Word): Integer; inline;
- begin
- if (utf16char < $D800) or (utf16char > $DFFF) then
- Result := 1
- else
- Result := 2;
- end;
- function utf16PairToUnicode(u1, u2: Word): Cardinal;
- begin
- if (u1 >= $D800) and (u1 <= $DBFF) then
- begin
- if (u2 >= $DC00) and (u2 <= $DFFF) then
- Result := (Cardinal(u1 - $D800) shl 10) + Cardinal(u2 - $DC00) + $10000
- else
- Result := 0;
- end
- else
- Result := u1;
- end;
- function Utf16LEToUtf8(const s: string): string;
- var
- len: Integer;
- Src, Limit: PWord;
- Dest: PAnsiChar;
- u: Cardinal;
- begin
- if Length(s) < 2 then begin
- Result:='';
- exit;
- end;
- Src:=PWord(Pointer(s));
- Limit := PWord(Pointer(Src) + Length(s));
- SetLength(Result, length(s) * 2);
- Dest:=PAnsiChar(Result);
- while Src + 1 <= Limit do begin
- len := UTF16CharLen(Src^);
- if len = 1 then
- u := LEtoN(Src^)
- else
- begin
- if Src + 2 <= Limit then
- u := utf16PairToUnicode(LEtoN(Src[0]), LEtoN(Src[1]))
- else
- break;
- end;
- inc(Src, len);
- if u<128 then begin
- Dest^:=chr(u);
- inc(Dest);
- end else begin
- inc(Dest,UnicodeToUTF8SkipErrors(u,Dest));
- end;
- end;
- len:=PtrUInt(Dest)-PtrUInt(Result);
- Assert(len <= length(Result));
- SetLength(Result,len);
- end;
- function Utf16BEToUtf8(const s: string): string;
- var
- len: Integer;
- Src, Limit: PWord;
- Dest: PAnsiChar;
- u: Cardinal;
- begin
- if Length(s) < 2 then begin
- Result:='';
- exit;
- end;
- Src:=PWord(Pointer(s));
- Limit := PWord(Pointer(Src) + Length(s));
- SetLength(Result, length(s) * 2);
- Dest:=PAnsiChar(Result);
- while Src + 1 <= Limit do begin
- len := UTF16CharLen(swap(Src^));
- if len = 1 then
- u := BEtoN(Src^)
- else
- begin
- if Src + 2 <= Limit then
- u := utf16PairToUnicode(BEtoN(Src[0]), BEtoN(Src[1]))
- else
- break;
- end;
- inc(Src, len);
- if u<128 then begin
- Dest^:=chr(u);
- inc(Dest);
- end else begin
- inc(Dest,UnicodeToUTF8SkipErrors(u,Dest));
- end;
- end;
- len:=PtrUInt(Dest)-PtrUInt(Result);
- Assert(len <= length(Result));
- SetLength(Result,len);
- end;
- function Utf32LEToUtf8(const s: string): string;
- var
- len: Integer;
- Src: PLongWord;
- Dest: PAnsiChar;
- i: Integer;
- c: LongWord;
- begin
- if Length(s) < 4 then begin
- Result:='';
- exit;
- end;
- len:=length(s) div 4;
- SetLength(Result,len*4);
- Src:=PLongWord(Pointer(s));
- Dest:=PAnsiChar(Result);
- for i:=1 to len do begin
- c:=LEtoN(Src^);
- inc(Src);
- if c<128 then begin
- Dest^:=chr(c);
- inc(Dest);
- end else begin
- inc(Dest,UnicodeToUTF8SkipErrors(c,Dest));
- end;
- end;
- len:=PtrUInt(Dest)-PtrUInt(Result);
- Assert(len <= length(Result));
- SetLength(Result,len);
- end;
- function Utf32BEToUtf8(const s: string): string;
- var
- len: Integer;
- Src: PLongWord;
- Dest: PAnsiChar;
- i: Integer;
- c: LongWord;
- begin
- if Length(s) < 4 then begin
- Result:='';
- exit;
- end;
- len:=length(s) div 4;
- SetLength(Result,len*4);
- Src:=PLongWord(Pointer(s));
- Dest:=PAnsiChar(Result);
- for i:=1 to len do begin
- c:=BEtoN(Src^);
- inc(Src);
- if c<128 then begin
- Dest^:=chr(c);
- inc(Dest);
- end else begin
- inc(Dest,UnicodeToUTF8SkipErrors(c,Dest));
- end;
- end;
- len:=PtrUInt(Dest)-PtrUInt(Result);
- Assert(len <= length(Result));
- SetLength(Result,len);
- end;
- function Utf8ToUtf16LE(const s: string): string;
- var
- L: SizeUInt;
- {$IF DEFINED(ENDIAN_BIG)}
- P: PWord;
- I: SizeInt;
- {$ENDIF}
- begin
- if Length(S) = 0 then
- begin
- Result := '';
- Exit;
- end;
- // Wide chars of UTF-16 <= bytes of UTF-8 string
- SetLength(Result, Length(S) * SizeOf(WideChar));
- if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar),
- PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError
- then
- Result := ''
- else
- begin
- SetLength(Result, (L - 1) * SizeOf(WideChar));
- // Swap endian if needed
- {$IF DEFINED(ENDIAN_BIG)}
- P := PWord(PAnsiChar(Result));
- for I := 0 to SizeInt(L) - 1 do
- begin
- P[I] := SwapEndian(P[I]);
- end;
- {$ENDIF}
- end;
- end;
- function Utf8ToUtf16BE(const s: string): string;
- var
- L: SizeUInt;
- {$IF DEFINED(ENDIAN_LITTLE)}
- P: PWord;
- I: SizeInt;
- {$ENDIF}
- begin
- if Length(S) = 0 then
- begin
- Result := '';
- Exit;
- end;
- // Wide chars of UTF-16 <= bytes of UTF-8 string
- SetLength(Result, Length(S) * SizeOf(WideChar));
- if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar),
- PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError
- then
- Result := ''
- else
- begin
- SetLength(Result, (L - 1) * SizeOf(WideChar));
- // Swap endian if needed
- {$IF DEFINED(ENDIAN_LITTLE)}
- P := PWord(PAnsiChar(Result));
- for I := 0 to SizeInt(L) - 1 do
- begin
- P[I] := SwapEndian(P[I]);
- end;
- {$ENDIF}
- end;
- end;
- function UTF8ToUCS4(const UTF8Text: String): UCS4String;
- var
- Len: PtrInt;
- Index: Integer;
- CharLen: Integer;
- SrcPos: PAnsiChar;
- begin
- Len:= Length(UTF8Text);
- SetLength(Result, Len);
- if Len = 0 then Exit;
- Index:= 0;
- SrcPos:= PAnsiChar(UTF8Text);
- while Len > 0 do begin
- Result[Index]:= UTF8CodepointToUnicode(SrcPos, CharLen);
- Inc(SrcPos, CharLen);
- Dec(Len, CharLen);
- Inc(Index);
- end;
- SetLength(Result, Index);
- end;
- function Utf8ReplaceBroken(const s: String): String;
- var
- Src, Dst, LastGoodPos: PByte;
- BytesLeft: Integer;
- InvalidCharLen: Integer;
- CharLen: Integer;
- begin
- if Length(s) = 0 then
- Exit(s);
- BytesLeft := Length(s);
- SetLength(Result, BytesLeft); // at most the same length
- Src := PByte(s);
- Dst := PByte(Result);
- LastGoodPos := Src;
- while BytesLeft > 0 do
- begin
- CharLen := SafeUTF8NextCharLen(Src, BytesLeft, InvalidCharLen);
- if InvalidCharLen > 0 then
- begin
- if LastGoodPos < Src then
- begin
- System.Move(LastGoodPos^, Dst^, Src - LastGoodPos);
- Inc(Dst, Src - LastGoodPos);
- end;
- Inc(Src, InvalidCharLen);
- Dec(BytesLeft, InvalidCharLen);
- LastGoodPos := Src;
- Dst^ := ord('?');
- Inc(Dst);
- end
- else
- begin
- Inc(Src, CharLen);
- Dec(BytesLeft, CharLen);
- end;
- end;
- if LastGoodPos = PByte(s) then
- Result := s // All characters are good.
- else
- begin
- if LastGoodPos < Src then
- begin
- System.Move(LastGoodPos^, Dst^, Src - LastGoodPos);
- Inc(Dst, Src - LastGoodPos);
- end;
- SetLength(Result, Dst - PByte(Result));
- end;
- end;
- procedure Utf8FixBroken(var S: String);
- var
- P: PAnsiChar;
- C, L: Integer;
- begin
- L:= Length(S);
- P:= Pointer(S);
- while (L > 0) do
- begin
- if Ord(P^) < %10000000 then begin
- // Regular single byte character
- C:= 1;
- end
- else if Ord(P^) < %11000000 then begin
- // Invalid character
- C:= 1;
- P^:= #26;
- end
- else if ((Ord(P^) and %11100000) = %11000000) then begin
- // Should be 2 byte character
- if (L > 1) and ((Ord(P[1]) and %11000000) = %10000000) then
- C:= 2
- else begin // Invalid character
- C:= 1;
- P^:= #26;
- end;
- end
- else if ((Ord(P^) and %11110000) = %11100000) then begin
- // Should be 3 byte character
- if (L > 2) and ((Ord(P[1]) and %11000000) = %10000000)
- and ((Ord(P[2]) and %11000000) = %10000000) then
- C:= 3
- else begin // Invalid character
- C:= 1;
- P^:= #26;
- end
- end
- else if ((Ord(P^) and %11111000) = %11110000) then begin
- // Should be 4 byte character
- if (L > 3) and ((Ord(P[1]) and %11000000) = %10000000)
- and ((Ord(P[2]) and %11000000) = %10000000)
- and ((Ord(P[3]) and %11000000) = %10000000) then
- C:= 4
- else begin // Invalid character
- C:= 1;
- P^:= #26;
- end
- end else begin // Invalid character
- C:= 1;
- P^:= #26;
- end;
- Dec(L, C);
- Inc(P, C);
- end;
- end;
- procedure Utf16SwapEndian(var S: UnicodeString);
- var
- P: PWord;
- I, L: Integer;
- begin
- L:= Length(S);
- P:= PWord(PWideChar(S));
- for I:= 0 to L - 1 do
- begin
- P[I]:= SwapEndian(P[I]);
- end;
- end;
- end.
|