dcunicodeutils.pas 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616
  1. {
  2. Most of this code is based on similar functions from Lazarus LCLProc.
  3. }
  4. unit DCUnicodeUtils;
  5. {$mode objfpc}{$H+}
  6. interface
  7. uses
  8. Classes, SysUtils;
  9. {en
  10. Retrieves length in bytes of the next UTF-8 character.
  11. @param(P
  12. Pointer to the UTF-8 characters.)
  13. @param(aMaxBytes
  14. States how many bytes from P can be read.)
  15. @param(InvalidCharLen
  16. If an invalid UTF-8 character was found then InvalidCharLen has
  17. the number of bytes this character spans. If the character was valid
  18. InvalidCharLen is zero.)
  19. }
  20. function SafeUTF8NextCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
  21. {en
  22. Retrieves length in bytes of the previous UTF-8 character.
  23. It does not read from P, but rather from memory locations before P.
  24. @param(P
  25. Pointer to the UTF-8 characters.)
  26. @param(aMaxBytes
  27. States how many bytes from P *backwards* can be read.
  28. So, to safely read 3 bytes backwards ([p-1], [p-2], [p-3])
  29. this parameter should be at least 3.)
  30. @param(InvalidCharLen
  31. If an invalid UTF-8 character was found then InvalidCharLen has
  32. the number of bytes this character spans. If the character was valid
  33. InvalidCharLen is zero.)
  34. }
  35. function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
  36. function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte;
  37. function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte;
  38. {en
  39. Returns UTF-16 character length, which is either 1 or 2.
  40. @param(utf16char
  41. Any UTF-16 char or one of the surrogate pairs.)
  42. }
  43. function UTF16CharLen(utf16char: Word): Integer;
  44. {en
  45. Converts an UTF-16 surrogate pair into a unicode character.
  46. }
  47. function utf16PairToUnicode(u1, u2: Word): Cardinal;
  48. function Utf16LEToUtf8(const s: string): string; // UTF16-LE 2 or 4 byte little endian
  49. function Utf16BEToUtf8(const s: string): string; // UTF16-BE 2 or 4 byte big endian
  50. function Utf32LEToUtf8(const s: string): string; // UTF32-LE 4 byte little endian
  51. function Utf32BEToUtf8(const s: string): string; // UTF32-BE 4 byte big endian
  52. function Utf8ToUtf16LE(const s: string): string; // UTF16-LE 2 or 4 byte little endian
  53. function Utf8ToUtf16BE(const s: string): string; // UTF16-BE 2 or 4 byte big endian
  54. function UTF8ToUCS4(const UTF8Text: String): UCS4String;
  55. {en
  56. Replaces invalid UTF-8 characters with '?'.
  57. }
  58. function Utf8ReplaceBroken(const s: String): String;
  59. {en
  60. Replaces invalid UTF-8 characters with 0x1A (SUBSTITUTE).
  61. }
  62. procedure Utf8FixBroken(var S: String);
  63. procedure Utf16SwapEndian(var S: UnicodeString);
  64. implementation
  65. uses
  66. LazUTF8;
  67. const
  68. maxUTF8Len = 7; // really is 4, but this includes any invalid characters up to length 7
  69. function SafeUTF8NextCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
  70. var
  71. BytesLen: Integer;
  72. i: Integer;
  73. begin
  74. if (p=nil) or (aMaxBytes = 0) then
  75. begin
  76. InvalidCharLen := 0;
  77. Result := 0;
  78. end
  79. else if p^<%10000000 then begin
  80. // regular single byte character
  81. InvalidCharLen := 0;
  82. Result := 1;
  83. end
  84. else if p^<%11000000 then begin
  85. // invalid single byte character
  86. InvalidCharLen := 1;
  87. Result := 1;
  88. end
  89. else
  90. begin
  91. // Read length of UTF-8 character in bytes.
  92. if ((p^ and %11100000) = %11000000) then BytesLen := 2
  93. else if ((p^ and %11110000) = %11100000) then BytesLen := 3
  94. else if ((p^ and %11111000) = %11110000) then BytesLen := 4
  95. else if ((p^ and %11111100) = %11111000) then BytesLen := 5
  96. else if ((p^ and %11111110) = %11111100) then BytesLen := 6
  97. else if ((p^ and %11111111) = %11111110) then BytesLen := 7
  98. else
  99. begin
  100. InvalidCharLen := 1;
  101. exit(1);
  102. end;
  103. // Check if the next bytes are from the middle of a character.
  104. for i := 1 to BytesLen - 1 do
  105. begin
  106. if (aMaxBytes < i) or ((p[i] and %11000000) <> %10000000) then
  107. begin
  108. InvalidCharLen := i;
  109. exit(1);
  110. end;
  111. end;
  112. InvalidCharLen := 0;
  113. Result := BytesLen;
  114. end;
  115. end;
  116. function SafeUTF8PrevCharLen(P: PByte; aMaxBytes: IntPtr; out InvalidCharLen: Integer): Integer;
  117. var
  118. BytesLen: Integer;
  119. signature: Byte;
  120. begin
  121. if (p=nil) or (aMaxBytes = 0) then
  122. begin
  123. InvalidCharLen := 0;
  124. Result := 0;
  125. end
  126. else if p[-1]<%10000000 then begin
  127. // regular single byte character
  128. InvalidCharLen := 0;
  129. Result := 1;
  130. end
  131. else
  132. begin
  133. for BytesLen := 1 to maxUTF8Len do
  134. begin
  135. if (aMaxBytes < BytesLen) then
  136. begin
  137. InvalidCharLen := aMaxBytes;
  138. exit(1);
  139. end;
  140. // Move past all the bytes in the middle of a character.
  141. if (p[-BytesLen] and %11000000) <> %10000000 then
  142. break;
  143. if BytesLen = maxUTF8Len then
  144. begin
  145. InvalidCharLen := BytesLen;
  146. exit(1);
  147. end;
  148. end;
  149. if p[-BytesLen]<%11000000 then
  150. begin
  151. // invalid first byte of a character
  152. InvalidCharLen := BytesLen;
  153. Result := 1;
  154. end
  155. else
  156. begin
  157. signature := Byte($FF shl (7 - BytesLen));
  158. if (p[-BytesLen] and signature) = Byte(signature shl 1) then
  159. begin
  160. // Correct first byte of a character.
  161. InvalidCharLen := 0;
  162. Result := BytesLen;
  163. end
  164. else
  165. begin
  166. // Invalid first byte of a character, or p is in the middle of a character.
  167. InvalidCharLen := BytesLen;
  168. Result := 1;
  169. end;
  170. end;
  171. end;
  172. end;
  173. function SafeUTF8NextCharStart(UTF8Str: PByte; Len: PtrInt): PByte;
  174. var
  175. CharLen: LongInt;
  176. InvalidCharLen: Integer;
  177. begin
  178. Result:=UTF8Str;
  179. if Result<>nil then begin
  180. while (Len>0) do begin
  181. CharLen := SafeUTF8NextCharLen(Result, Len, InvalidCharLen);
  182. if InvalidCharLen > 0 then
  183. begin
  184. dec(Len,InvalidCharLen);
  185. inc(Result,InvalidCharLen);
  186. end
  187. else if CharLen = 0 then
  188. exit(nil)
  189. else
  190. exit(Result);
  191. end;
  192. Result:=nil;
  193. end;
  194. end;
  195. function SafeUTF8PrevCharEnd(UTF8Str: PByte; Len: PtrInt): PByte;
  196. var
  197. CharLen: LongInt;
  198. InvalidCharLen: Integer;
  199. begin
  200. Result:=UTF8Str;
  201. if Result<>nil then begin
  202. while (Len>0) do begin
  203. CharLen := SafeUTF8PrevCharLen(Result, Len, InvalidCharLen);
  204. if InvalidCharLen > 0 then
  205. begin
  206. dec(Len,InvalidCharLen);
  207. dec(Result,InvalidCharLen);
  208. end
  209. else if CharLen = 0 then
  210. exit(nil)
  211. else
  212. exit(Result); // Result is the character beginning
  213. end;
  214. Result:=nil;
  215. end;
  216. end;
  217. function UTF16CharLen(utf16char: Word): Integer; inline;
  218. begin
  219. if (utf16char < $D800) or (utf16char > $DFFF) then
  220. Result := 1
  221. else
  222. Result := 2;
  223. end;
  224. function utf16PairToUnicode(u1, u2: Word): Cardinal;
  225. begin
  226. if (u1 >= $D800) and (u1 <= $DBFF) then
  227. begin
  228. if (u2 >= $DC00) and (u2 <= $DFFF) then
  229. Result := (Cardinal(u1 - $D800) shl 10) + Cardinal(u2 - $DC00) + $10000
  230. else
  231. Result := 0;
  232. end
  233. else
  234. Result := u1;
  235. end;
  236. function Utf16LEToUtf8(const s: string): string;
  237. var
  238. len: Integer;
  239. Src, Limit: PWord;
  240. Dest: PAnsiChar;
  241. u: Cardinal;
  242. begin
  243. if Length(s) < 2 then begin
  244. Result:='';
  245. exit;
  246. end;
  247. Src:=PWord(Pointer(s));
  248. Limit := PWord(Pointer(Src) + Length(s));
  249. SetLength(Result, length(s) * 2);
  250. Dest:=PAnsiChar(Result);
  251. while Src + 1 <= Limit do begin
  252. len := UTF16CharLen(Src^);
  253. if len = 1 then
  254. u := LEtoN(Src^)
  255. else
  256. begin
  257. if Src + 2 <= Limit then
  258. u := utf16PairToUnicode(LEtoN(Src[0]), LEtoN(Src[1]))
  259. else
  260. break;
  261. end;
  262. inc(Src, len);
  263. if u<128 then begin
  264. Dest^:=chr(u);
  265. inc(Dest);
  266. end else begin
  267. inc(Dest,UnicodeToUTF8SkipErrors(u,Dest));
  268. end;
  269. end;
  270. len:=PtrUInt(Dest)-PtrUInt(Result);
  271. Assert(len <= length(Result));
  272. SetLength(Result,len);
  273. end;
  274. function Utf16BEToUtf8(const s: string): string;
  275. var
  276. len: Integer;
  277. Src, Limit: PWord;
  278. Dest: PAnsiChar;
  279. u: Cardinal;
  280. begin
  281. if Length(s) < 2 then begin
  282. Result:='';
  283. exit;
  284. end;
  285. Src:=PWord(Pointer(s));
  286. Limit := PWord(Pointer(Src) + Length(s));
  287. SetLength(Result, length(s) * 2);
  288. Dest:=PAnsiChar(Result);
  289. while Src + 1 <= Limit do begin
  290. len := UTF16CharLen(swap(Src^));
  291. if len = 1 then
  292. u := BEtoN(Src^)
  293. else
  294. begin
  295. if Src + 2 <= Limit then
  296. u := utf16PairToUnicode(BEtoN(Src[0]), BEtoN(Src[1]))
  297. else
  298. break;
  299. end;
  300. inc(Src, len);
  301. if u<128 then begin
  302. Dest^:=chr(u);
  303. inc(Dest);
  304. end else begin
  305. inc(Dest,UnicodeToUTF8SkipErrors(u,Dest));
  306. end;
  307. end;
  308. len:=PtrUInt(Dest)-PtrUInt(Result);
  309. Assert(len <= length(Result));
  310. SetLength(Result,len);
  311. end;
  312. function Utf32LEToUtf8(const s: string): string;
  313. var
  314. len: Integer;
  315. Src: PLongWord;
  316. Dest: PAnsiChar;
  317. i: Integer;
  318. c: LongWord;
  319. begin
  320. if Length(s) < 4 then begin
  321. Result:='';
  322. exit;
  323. end;
  324. len:=length(s) div 4;
  325. SetLength(Result,len*4);
  326. Src:=PLongWord(Pointer(s));
  327. Dest:=PAnsiChar(Result);
  328. for i:=1 to len do begin
  329. c:=LEtoN(Src^);
  330. inc(Src);
  331. if c<128 then begin
  332. Dest^:=chr(c);
  333. inc(Dest);
  334. end else begin
  335. inc(Dest,UnicodeToUTF8SkipErrors(c,Dest));
  336. end;
  337. end;
  338. len:=PtrUInt(Dest)-PtrUInt(Result);
  339. Assert(len <= length(Result));
  340. SetLength(Result,len);
  341. end;
  342. function Utf32BEToUtf8(const s: string): string;
  343. var
  344. len: Integer;
  345. Src: PLongWord;
  346. Dest: PAnsiChar;
  347. i: Integer;
  348. c: LongWord;
  349. begin
  350. if Length(s) < 4 then begin
  351. Result:='';
  352. exit;
  353. end;
  354. len:=length(s) div 4;
  355. SetLength(Result,len*4);
  356. Src:=PLongWord(Pointer(s));
  357. Dest:=PAnsiChar(Result);
  358. for i:=1 to len do begin
  359. c:=BEtoN(Src^);
  360. inc(Src);
  361. if c<128 then begin
  362. Dest^:=chr(c);
  363. inc(Dest);
  364. end else begin
  365. inc(Dest,UnicodeToUTF8SkipErrors(c,Dest));
  366. end;
  367. end;
  368. len:=PtrUInt(Dest)-PtrUInt(Result);
  369. Assert(len <= length(Result));
  370. SetLength(Result,len);
  371. end;
  372. function Utf8ToUtf16LE(const s: string): string;
  373. var
  374. L: SizeUInt;
  375. {$IF DEFINED(ENDIAN_BIG)}
  376. P: PWord;
  377. I: SizeInt;
  378. {$ENDIF}
  379. begin
  380. if Length(S) = 0 then
  381. begin
  382. Result := '';
  383. Exit;
  384. end;
  385. // Wide chars of UTF-16 <= bytes of UTF-8 string
  386. SetLength(Result, Length(S) * SizeOf(WideChar));
  387. if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar),
  388. PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError
  389. then
  390. Result := ''
  391. else
  392. begin
  393. SetLength(Result, (L - 1) * SizeOf(WideChar));
  394. // Swap endian if needed
  395. {$IF DEFINED(ENDIAN_BIG)}
  396. P := PWord(PAnsiChar(Result));
  397. for I := 0 to SizeInt(L) - 1 do
  398. begin
  399. P[I] := SwapEndian(P[I]);
  400. end;
  401. {$ENDIF}
  402. end;
  403. end;
  404. function Utf8ToUtf16BE(const s: string): string;
  405. var
  406. L: SizeUInt;
  407. {$IF DEFINED(ENDIAN_LITTLE)}
  408. P: PWord;
  409. I: SizeInt;
  410. {$ENDIF}
  411. begin
  412. if Length(S) = 0 then
  413. begin
  414. Result := '';
  415. Exit;
  416. end;
  417. // Wide chars of UTF-16 <= bytes of UTF-8 string
  418. SetLength(Result, Length(S) * SizeOf(WideChar));
  419. if ConvertUTF8ToUTF16(PWideChar(PAnsiChar(Result)), Length(Result) + SizeOf(WideChar),
  420. PAnsiChar(S), Length(S), [toInvalidCharToSymbol], L) <> trNoError
  421. then
  422. Result := ''
  423. else
  424. begin
  425. SetLength(Result, (L - 1) * SizeOf(WideChar));
  426. // Swap endian if needed
  427. {$IF DEFINED(ENDIAN_LITTLE)}
  428. P := PWord(PAnsiChar(Result));
  429. for I := 0 to SizeInt(L) - 1 do
  430. begin
  431. P[I] := SwapEndian(P[I]);
  432. end;
  433. {$ENDIF}
  434. end;
  435. end;
  436. function UTF8ToUCS4(const UTF8Text: String): UCS4String;
  437. var
  438. Len: PtrInt;
  439. Index: Integer;
  440. CharLen: Integer;
  441. SrcPos: PAnsiChar;
  442. begin
  443. Len:= Length(UTF8Text);
  444. SetLength(Result, Len);
  445. if Len = 0 then Exit;
  446. Index:= 0;
  447. SrcPos:= PAnsiChar(UTF8Text);
  448. while Len > 0 do begin
  449. Result[Index]:= UTF8CodepointToUnicode(SrcPos, CharLen);
  450. Inc(SrcPos, CharLen);
  451. Dec(Len, CharLen);
  452. Inc(Index);
  453. end;
  454. SetLength(Result, Index);
  455. end;
  456. function Utf8ReplaceBroken(const s: String): String;
  457. var
  458. Src, Dst, LastGoodPos: PByte;
  459. BytesLeft: Integer;
  460. InvalidCharLen: Integer;
  461. CharLen: Integer;
  462. begin
  463. if Length(s) = 0 then
  464. Exit(s);
  465. BytesLeft := Length(s);
  466. SetLength(Result, BytesLeft); // at most the same length
  467. Src := PByte(s);
  468. Dst := PByte(Result);
  469. LastGoodPos := Src;
  470. while BytesLeft > 0 do
  471. begin
  472. CharLen := SafeUTF8NextCharLen(Src, BytesLeft, InvalidCharLen);
  473. if InvalidCharLen > 0 then
  474. begin
  475. if LastGoodPos < Src then
  476. begin
  477. System.Move(LastGoodPos^, Dst^, Src - LastGoodPos);
  478. Inc(Dst, Src - LastGoodPos);
  479. end;
  480. Inc(Src, InvalidCharLen);
  481. Dec(BytesLeft, InvalidCharLen);
  482. LastGoodPos := Src;
  483. Dst^ := ord('?');
  484. Inc(Dst);
  485. end
  486. else
  487. begin
  488. Inc(Src, CharLen);
  489. Dec(BytesLeft, CharLen);
  490. end;
  491. end;
  492. if LastGoodPos = PByte(s) then
  493. Result := s // All characters are good.
  494. else
  495. begin
  496. if LastGoodPos < Src then
  497. begin
  498. System.Move(LastGoodPos^, Dst^, Src - LastGoodPos);
  499. Inc(Dst, Src - LastGoodPos);
  500. end;
  501. SetLength(Result, Dst - PByte(Result));
  502. end;
  503. end;
  504. procedure Utf8FixBroken(var S: String);
  505. var
  506. P: PAnsiChar;
  507. C, L: Integer;
  508. begin
  509. L:= Length(S);
  510. P:= Pointer(S);
  511. while (L > 0) do
  512. begin
  513. if Ord(P^) < %10000000 then begin
  514. // Regular single byte character
  515. C:= 1;
  516. end
  517. else if Ord(P^) < %11000000 then begin
  518. // Invalid character
  519. C:= 1;
  520. P^:= #26;
  521. end
  522. else if ((Ord(P^) and %11100000) = %11000000) then begin
  523. // Should be 2 byte character
  524. if (L > 1) and ((Ord(P[1]) and %11000000) = %10000000) then
  525. C:= 2
  526. else begin // Invalid character
  527. C:= 1;
  528. P^:= #26;
  529. end;
  530. end
  531. else if ((Ord(P^) and %11110000) = %11100000) then begin
  532. // Should be 3 byte character
  533. if (L > 2) and ((Ord(P[1]) and %11000000) = %10000000)
  534. and ((Ord(P[2]) and %11000000) = %10000000) then
  535. C:= 3
  536. else begin // Invalid character
  537. C:= 1;
  538. P^:= #26;
  539. end
  540. end
  541. else if ((Ord(P^) and %11111000) = %11110000) then begin
  542. // Should be 4 byte character
  543. if (L > 3) and ((Ord(P[1]) and %11000000) = %10000000)
  544. and ((Ord(P[2]) and %11000000) = %10000000)
  545. and ((Ord(P[3]) and %11000000) = %10000000) then
  546. C:= 4
  547. else begin // Invalid character
  548. C:= 1;
  549. P^:= #26;
  550. end
  551. end else begin // Invalid character
  552. C:= 1;
  553. P^:= #26;
  554. end;
  555. Dec(L, C);
  556. Inc(P, C);
  557. end;
  558. end;
  559. procedure Utf16SwapEndian(var S: UnicodeString);
  560. var
  561. P: PWord;
  562. I, L: Integer;
  563. begin
  564. L:= Length(S);
  565. P:= PWord(PWideChar(S));
  566. for I:= 0 to L - 1 do
  567. begin
  568. P[I]:= SwapEndian(P[I]);
  569. end;
  570. end;
  571. end.