dcconvertencoding.pas 17 KB


  1. unit DCConvertEncoding;
  2. {$mode objfpc}{$H+}
  3. {$IF DEFINED(DARWIN)}
  4. {$modeswitch objectivec1}
  5. {$ENDIF}
  6. interface
  7. uses
  8. Classes, SysUtils;
  9. {$IF NOT DECLARED(RawByteString)}
  10. type
  11. RawByteString = AnsiString;
  12. {$IFEND}
  13. var
  14. {en
  15. Convert from OEM to System encoding, if needed
  16. }
  17. CeOemToSys: function (const Source: String): RawByteString;
  18. CeSysToOem: function (const Source: String): RawByteString;
  19. {en
  20. Convert from OEM to UTF-8 encoding, if needed
  21. }
  22. CeOemToUtf8: function (const Source: String): RawByteString;
  23. CeUtf8ToOem: function (const Source: String): RawByteString;
  24. {en
  25. Convert from Ansi to System encoding, if needed
  26. }
  27. CeAnsiToSys: function (const Source: String): RawByteString;
  28. CeSysToAnsi: function (const Source: String): RawByteString;
  29. {en
  30. Convert from ANSI to UTF-8 encoding, if needed
  31. }
  32. CeAnsiToUtf8: function (const Source: String): RawByteString;
  33. CeUtf8ToAnsi: function (const Source: String): RawByteString;
  34. {en
  35. Convert from UTF-8 to System encoding, if needed
  36. }
  37. CeUtf8ToSys: function (const Source: String): RawByteString;
  38. CeSysToUtf8: function (const Source: String): RawByteString;
  39. function CeRawToUtf8(const Source: String): RawByteString;
  40. function CeUtf8ToUtf16(const Source: String): UnicodeString;
  41. function CeUtf16ToUtf8(const Source: UnicodeString): RawByteString;
  42. {$IF DEFINED(MSWINDOWS)}
  43. function CeTryEncode(const aValue: UnicodeString; aCodePage: Cardinal;
  44. aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
  45. function CeTryDecode(const aValue: AnsiString; aCodePage: Cardinal;
  46. out aResult: UnicodeString): Boolean;
  47. {$ELSEIF DEFINED(UNIX)}
  48. var
  49. SystemEncodingUtf8: Boolean = False;
  50. SystemEncoding, SystemLocale: String;
  51. {$ENDIF}
  52. var
  53. SystemLanguage: String;
  54. implementation
  55. uses
  56. {$IF DEFINED(UNIX)}
  57. LazUTF8
  58. {$IF DEFINED(DARWIN)}
  59. , dc_iconvenc_dyn, MacOSAll, CocoaAll, StrUtils
  60. {$ELSE}
  61. , iconvenc_dyn, UnixCP
  62. {$ENDIF}
  63. {$ELSEIF DEFINED(MSWINDOWS)}
  64. Windows
  65. {$ENDIF}
  66. ;
  67. {$IF DEFINED(FPC_HAS_CPSTRING)}
  68. var
  69. FileSystemCodePage: TSystemCodePage;
  70. {$ENDIF}
  {
    Returns the byte length (1..4) of the UTF-8 sequence that starts at P,
    or 0 when P is nil or the bytes do not form a well-shaped sequence.

    Only the bit patterns of the lead byte and of the continuation bytes are
    checked; overlong forms and out-of-range code points are not rejected.
    Continuation bytes are examined left to right and the check stops at the
    first mismatch (short-circuit boolean evaluation), so a zero terminator
    is never read past.
  }
  function UTF8CharacterStrictLength(P: PAnsiChar): integer;

    // True when the byte at offset Index has the form 10xxxxxx
    function IsContinuation(Index: Integer): Boolean;
    begin
      Result:= (Ord(P[Index]) and $C0) = $80;
    end;

  var
    Lead: Byte;
  begin
    // Every path that does not recognise a valid sequence reports 0
    Result:= 0;
    if P = nil then Exit;

    Lead:= Ord(P^);
    if Lead < $80 then
      // 0xxxxxxx: plain single byte character
      Result:= 1
    else if (Lead and $E0) = $C0 then
    begin
      // 110xxxxx: lead byte of a 2 byte character
      if IsContinuation(1) then
        Result:= 2;
    end
    else if (Lead and $F0) = $E0 then
    begin
      // 1110xxxx: lead byte of a 3 byte character
      if IsContinuation(1) and IsContinuation(2) then
        Result:= 3;
    end
    else if (Lead and $F8) = $F0 then
    begin
      // 11110xxx: lead byte of a 4 byte character
      if IsContinuation(1) and IsContinuation(2) and IsContinuation(3) then
        Result:= 4;
    end;
    // Anything else (stray continuation byte 10xxxxxx, or 11111xxx) stays 0
  end;
  {
    Returns Source unchanged when all of its bytes form well-shaped UTF-8
    (plain ASCII included); otherwise returns CeSysToUtf8(Source).
  }
  function CeRawToUtf8(const Source: String): RawByteString;
  var
    Cursor, Finish: PAnsiChar;
    Step: LongInt;
  begin
    Cursor:= PAnsiChar(Source);
    // First byte after the string data, i.e. the position of the terminator
    Finish:= Cursor + Length(Source);
    while True do
    begin
      if Ord(Cursor^) >= 128 then
      begin
        // Multi byte candidate: must be a well-shaped UTF-8 sequence
        Step:= UTF8CharacterStrictLength(Cursor);
        if Step = 0 then
          // Not UTF-8: treat the data as being in the system encoding
          Exit(CeSysToUtf8(Source));
        Inc(Cursor, Step);
      end
      else begin
        // ASCII; a #0 located at or past the end is the string terminator,
        // a #0 before that is an embedded character and is skipped
        if (Cursor^ = #0) and (Cursor >= Finish) then
          Exit(Source);
        Inc(Cursor);
      end;
    end;
  end;
  {
    Converts a UTF-8 encoded string to UTF-16.
    Returns an empty string for empty input or when the conversion fails.
  }
  function CeUtf8ToUtf16(const Source: String): UnicodeString;
  var
    SrcLen: SizeUInt;
  {$IF NOT DEFINED(MSWINDOWS)}
    Written: SizeUInt;
  {$ENDIF}
  begin
    SrcLen:= Length(Source);
    if SrcLen = 0 then Exit('');
    // wide chars of UTF-16 <= bytes of UTF-8 string (plus one for the terminator)
    SetLength(Result, SrcLen + 1);
  {$IF DEFINED(MSWINDOWS)}
    // The API returns the number of wide chars written (0 on failure)
    SetLength(Result, MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(Source), SrcLen,
                                          PWideChar(Result), SrcLen + 1));
  {$ELSE}
    if ConvertUTF8ToUTF16(PUnicodeChar(Result), SrcLen + 1, PAnsiChar(Source), SrcLen,
                          [toInvalidCharToSymbol], Written) <> trNoError then
      SetLength(Result, 0)
    else
      // One is subtracted from the reported count, as the original did
      // (presumably the count includes the terminating #0 - TODO confirm)
      SetLength(Result, Written - 1);
  {$ENDIF}
  end;
  {
    Converts a UTF-16 string to UTF-8.
    Returns an empty string for empty input or when the conversion fails.
  }
  function CeUtf16ToUtf8(const Source: UnicodeString): RawByteString;
  var
    SrcLen: SizeUInt;
  {$IF NOT DEFINED(MSWINDOWS)}
    Written: SizeUInt;
  {$ENDIF}
  begin
    SrcLen:= Length(Source);
    if SrcLen = 0 then Exit('');
    // bytes of UTF-8 <= 3 * wide chars of UTF-16 string
    // e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
    SetLength(Result, SrcLen * 3);
  {$IF DEFINED(MSWINDOWS)}
    // The API returns the number of bytes written (0 on failure)
    SetLength(Result, WideCharToMultiByte(CP_UTF8, 0, PWideChar(Source), SrcLen,
                                          PAnsiChar(Result), Length(Result), nil, nil));
  {$ELSE}
    if ConvertUTF16ToUTF8(PAnsiChar(Result), Length(Result) + 1, PUnicodeChar(Source), SrcLen,
                          [toInvalidCharToSymbol], Written) <> trNoError then
      SetLength(Result, 0)
    else
      // One is subtracted from the reported count, as the original did
      // (presumably the count includes the terminating #0 - TODO confirm)
      SetLength(Result, Written - 1);
  {$ENDIF}
  end;
  { Identity converter: hands the source string back untouched.
    Installed wherever no real encoding conversion is required. }
  function Dummy(const Source: String): RawByteString;
  begin
    Exit(Source);
  end;
  {$IF DEFINED(FPC_HAS_CPSTRING)}
  {
    Shared worker for Sys2UTF8 / UTF82Sys: labels a copy of Source as being
    in FromCP, converts its bytes to ToCP and finally labels the result with
    DefaultSystemCodePage without touching the bytes.
  }
  function ConvertViaCodePage(const Source: String;
    FromCP, ToCP: TSystemCodePage): RawByteString;
  begin
    Result:= Source;
    // Declare what the bytes really are (no conversion) ...
    SetCodePage(Result, FromCP, False);
    // ... then convert them to the target code page
    SetCodePage(Result, ToCP, True);
    // Prevent another codepage appear in the strings
    // we don't need codepage conversion magic in our code
    SetCodePage(Result, DefaultSystemCodePage, False);
  end;

  { Converts from the file system code page to UTF-8 }
  function Sys2UTF8(const Source: String): RawByteString;
  begin
    Result:= ConvertViaCodePage(Source, FileSystemCodePage, CP_UTF8);
  end;

  { Converts from UTF-8 to the file system code page }
  function UTF82Sys(const Source: String): RawByteString;
  begin
    Result:= ConvertViaCodePage(Source, CP_UTF8, FileSystemCodePage);
  end;
  {$ELSE}
  { Converts from the system encoding to UTF-8 using the RTL helper }
  function Sys2UTF8(const Source: String): RawByteString;
  begin
    Exit(UTF8Encode(Source));
  end;

  { Converts from UTF-8 to the system encoding using the RTL helper }
  function UTF82Sys(const Source: String): RawByteString;
  begin
    Exit(UTF8Decode(Source));
  end;
  {$ENDIF}
  223. {$IF DEFINED(MSWINDOWS)}
  {
    Try to encode the given Unicode string as the requested codepage.

    aValue        - UTF-16 text to encode
    aCodePage     - target Windows code page identifier
    aAllowBestFit - when False, WC_NO_BEST_FIT_CHARS is passed so characters
                    are not silently replaced by look-alikes
    aResult       - receives the encoded bytes; empty when the call fails

    Returns True when every character was encoded without falling back to
    the code page's default character. Returns False when the system
    conversion call fails, and also when aAllowBestFit is False on a system
    where CheckWin32Version(4, 1) fails. An empty aValue is reported as
    encodable (True, empty aResult).

    NOTE: WideCharToMultiByte requires lpUsedDefaultChar to be nil for
    CP_UTF8 / CP_UTF7, so for those code pages the call fails and this
    function returns False.
  }
  function CeTryEncode(const aValue: UnicodeString; aCodePage: Cardinal;
    aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
  const
    WC_NO_BEST_FIT_CHARS = $00000400;
    Flags: array[Boolean] of DWORD = (WC_NO_BEST_FIT_CHARS, 0);
  var
    UsedDefault: BOOL;
    Needed: Integer;
  begin
    Result:= False;
    aResult:= '';
    if not aAllowBestFit and not CheckWin32Version(4, 1) then
      Exit;
    // The API rejects a zero-length source; nothing to encode means nothing
    // can be lost
    if Length(aValue) = 0 then
      Exit(True);
    // Must be initialised: the API leaves it untouched when it fails
    UsedDefault:= False;
    // First pass: query the required buffer size in bytes
    Needed:= WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
      PWideChar(aValue), Length(aValue), nil, 0, nil, @UsedDefault);
    if Needed <= 0 then
      Exit;
    SetLength(aResult, Needed);
    // Second pass: perform the conversion
    Needed:= WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
      PWideChar(aValue), Length(aValue), PAnsiChar(aResult),
      Length(aResult), nil, @UsedDefault);
    if Needed <= 0 then
    begin
      aResult:= '';
      Exit;
    end;
    SetLength(aResult, Needed);
    Result:= not UsedDefault;
  end;
  {
    Try to decode the given byte string from the requested codepage to UTF-16.

    aValue    - bytes in code page aCodePage
    aCodePage - source Windows code page identifier
    aResult   - receives the decoded text; empty when decoding fails

    Returns True when at least one character was decoded. Invalid input
    (MB_ERR_INVALID_CHARS is passed) and empty input both yield False.
  }
  function CeTryDecode(const aValue: AnsiString; aCodePage: Cardinal;
    out aResult: UnicodeString): Boolean;
  var
    Needed: Integer;
  begin
    aResult:= '';
    // First pass: required size, already expressed in wide characters, which
    // is also the unit SetLength uses for a UnicodeString
    Needed:= MultiByteToWideChar(aCodePage, MB_ERR_INVALID_CHARS,
      LPCSTR(aValue), Length(aValue), nil, 0);
    if Needed > 0 then
    begin
      SetLength(aResult, Needed);
      // Second pass: convert, then trim to what was actually written
      SetLength(aResult, MultiByteToWideChar(aCodePage, MB_ERR_INVALID_CHARS,
        LPCSTR(aValue), Length(aValue), PWideChar(aResult), Length(aResult)));
    end;
    Result:= Length(aResult) > 0;
  end;
  { Converts from the OEM code page (CP_OEMCP) to UTF-8.
    When decoding fails the source is returned unchanged. }
  function Oem2Utf8(const Source: String): RawByteString;
  var
    Wide: UnicodeString;
  begin
    // Fallback value, replaced only on successful decoding
    Result:= Source;
    if CeTryDecode(Source, CP_OEMCP, Wide) then
      Result:= CeUtf16ToUtf8(Wide);
  end;
  { Converts from UTF-8 to the OEM code page (CP_OEMCP) without best-fit
    substitution. When encoding fails the source is returned unchanged. }
  function Utf82Oem(const Source: String): RawByteString;
  var
    Wide: UnicodeString;
    Encoded: AnsiString;
  begin
    Wide:= CeUtf8ToUtf16(Source);
    // Fallback value, replaced only on successful encoding
    Result:= Source;
    if CeTryEncode(Wide, CP_OEMCP, False, Encoded) then
      Result:= Encoded;
  end;
  {
    Converts from the OEM code page to the ANSI code page using OEMToChar.
    Returns the source unchanged when the system call reports failure.
    No length is passed to the API and the result is read back with StrPas,
    so the conversion stops at the first #0 character.
  }
  function OEM2Ansi(const Source: String): RawByteString;
  var
    Dst: PAnsiChar;
  begin
    Result:= Source;
    // One extra char for the terminating zero
    Dst:= AllocMem((Length(Result) + 1) * SizeOf(AnsiChar));
    try
      if OEMToChar(PAnsiChar(Result), Dst) then
        Result:= StrPas(Dst);
    finally
      // Release the buffer even when building the result string raises
      FreeMem(Dst);
    end;
  end;
  {
    Converts from the ANSI code page to the OEM code page using CharToOEM.
    Returns the source unchanged when the system call reports failure.
    No length is passed to the API and the result is read back with StrPas,
    so the conversion stops at the first #0 character.
  }
  function Ansi2OEM(const Source: String): RawByteString;
  var
    Dst: PAnsiChar;
  begin
    Result:= Source;
    // One extra char for the terminating zero
    Dst:= AllocMem((Length(Result) + 1) * SizeOf(AnsiChar));
    try
      if CharToOEM(PAnsiChar(Result), Dst) then
        Result:= StrPas(Dst);
    finally
      // Release the buffer even when building the result string raises
      FreeMem(Dst);
    end;
  end;
  {
    Windows setup: installs the converter routines and reads the user's
    language from the locale settings.
  }
  procedure Initialize;
  var
    LangName: array[1..4] of AnsiChar;
  begin
    // ANSI is the system encoding here, so these need no work at all
    CeAnsiToSys:= @Dummy;
    CeSysToAnsi:= @Dummy;

    // System (ANSI) <-> UTF-8
    CeSysToUtf8:= @Sys2UTF8;
    CeUtf8ToSys:= @UTF82Sys;
    CeAnsiToUtf8:= @Sys2UTF8;
    CeUtf8ToAnsi:= @UTF82Sys;

    // OEM <-> system (ANSI) and OEM <-> UTF-8
    CeOemToSys:= @OEM2Ansi;
    CeSysToOem:= @Ansi2OEM;
    CeOemToUtf8:= @Oem2Utf8;
    CeUtf8ToOem:= @Utf82Oem;

    // Keep the first two letters of the abbreviated language name, lower-cased
    if GetLocaleInfo(GetUserDefaultLCID, LOCALE_SABBREVLANGNAME,
                     @LangName[1], Length(LangName)) > 0 then
      SystemLanguage:= LowerCase(Copy(LangName, 1, 2));
  end;
  308. {$ELSEIF DEFINED(UNIX)}
  309. {$I dcconvertencoding.inc}
  310. const
  311. EncodingUTF8 = 'UTF-8'; // UTF-8 Encoding
  312. var
  313. EncodingOEM, // OEM Encoding
  314. EncodingANSI: String; // ANSI Encoding
  {
    Determines SystemLanguage, SystemLocale and SystemEncoding.

    Darwin: the encoding is always UTF-8; the language comes from the first
    preferred language and the country from the current NSLocale.

    Other Unix: parses the first non-empty of LC_ALL, LC_CTYPE, LANG, which
    is expected to look like language[_territory][.codeset][@modifier].

    Returns False when the language cannot be determined (no preferred
    language / none of the environment variables set).
  }
  function GetSystemEncoding: Boolean;
  {$IF DEFINED(DARWIN)}
  var
    Country: String;
    CurrentLocale: NSLocale;
    LanguageCFRef: CFStringRef = nil;
    LanguageCFArray: CFArrayRef = nil;
  begin
    // System encoding
    SystemEncoding:= EncodingUTF8;
    // Get system language
    LanguageCFArray:= CFLocaleCopyPreferredLanguages;
    try
      Result:= CFArrayGetCount(LanguageCFArray) > 0;
      if Result then
      begin
        LanguageCFRef:= CFArrayGetValueAtIndex(LanguageCFArray, 0);
        SetLength(SystemLanguage, MAX_PATH);
        Result:= CFStringGetCString(LanguageCFRef,
                                    PAnsiChar(SystemLanguage),
                                    MAX_PATH,
                                    kCFStringEncodingUTF8
                                    );
      end;
    finally
      CFRelease(LanguageCFArray);
    end;
    if Result then
    begin
      // Crop to terminating zero
      SystemLanguage:= PAnsiChar(SystemLanguage);
      SystemLanguage:= Copy2Symb(SystemLanguage, '-');
      // Get system country
      CurrentLocale:= NSLocale.currentLocale();
      Country:= NSString(CurrentLocale.objectForKey(NSLocaleCountryCode)).UTF8String;
      // Combine system locale
      if (Length(SystemLanguage) > 0) and (Length(Country) > 0) then
      begin
        SystemLocale:= SystemLanguage + '_' + Country;
      end;
    end;
  end;
  {$ELSE}
  var
    I: Integer;
    Lang: String;
  begin
    // First non-empty variable wins: LC_ALL, then LC_CTYPE, then LANG
    Lang:= SysUtils.GetEnvironmentVariable('LC_ALL');
    if Length(Lang) = 0 then
      Lang:= SysUtils.GetEnvironmentVariable('LC_CTYPE');
    if Length(Lang) = 0 then
      Lang:= SysUtils.GetEnvironmentVariable('LANG');
    if Length(Lang) = 0 then
      Exit(False);
    Result:= True;

    // Language: everything before the first '_', '.' or '@', so that values
    // without a territory such as 'C.UTF-8' give 'C' and not the whole string
    I:= 1;
    while (I <= Length(Lang)) and not (Lang[I] in ['_', '.', '@']) do
      Inc(I);
    SystemLanguage:= Copy(Lang, 1, I - 1);

    I:= System.Pos('.', Lang);
    if (I > 0) then
    begin
      SystemLocale:= Copy(Lang, 1, I - 1);
      SystemEncoding:= Copy(Lang, I + 1, Length(Lang) - I);
      // Drop a trailing '@modifier' (e.g. 'ISO-8859-15@euro'): it is not
      // part of the codeset name
      I:= System.Pos('@', SystemEncoding);
      if (I > 0) then
        SetLength(SystemEncoding, I - 1);
    end
    else begin
      // No codeset given: keep the whole value as locale, assume UTF-8
      SystemLocale:= Lang;
      SystemEncoding:= EncodingUTF8;
    end;
  end;
  {$ENDIF}
  392. {$IF DEFINED(DARWIN)}
  { Tries to load 'libiconv.dylib'. Error is cleared first and is passed on
    to TryLoadLib. IconvLibFound is raised on success and never lowered. }
  function InitIconv(var Error: String): Boolean;
  begin
    Error:= EmptyStr;
    Result:= TryLoadLib('libiconv.dylib', Error);
    if Result then
      IconvLibFound:= True;
  end;
  399. {$ELSEIF DEFINED(FPC_HAS_CPSTRING)}
  400. var
  401. AManager : TUnicodeStringManager;
  { Replacement for the string manager's GetStandardCodePageProc: answers
    with UnixCP.GetSystemCodepage whatever kind of code page is asked for
    (stdcp is ignored). }
  function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  begin
    Exit(UnixCP.GetSystemCodepage);
  end;
  { Re-labels an open standard text file with the console code page that
    matches its direction; files in any other mode are left alone. }
  procedure SetStdIOCodePage(var T: Text); inline;
  begin
    if TextRec(T).Mode = fmInput then
      TextRec(T).CodePage:= GetStandardCodePage(scpConsoleInput)
    else if TextRec(T).Mode = fmOutput then
      TextRec(T).CodePage:= GetStandardCodePage(scpConsoleOutput);
  end;
  { Applies SetStdIOCodePage to each of the five standard text files }
  procedure SetStdIOCodePages; inline;
  begin
    // Standard input
    SetStdIOCodePage(Input);
    // Standard output and its alias
    SetStdIOCodePage(Output);
    SetStdIOCodePage(StdOut);
    // Standard error and its alias
    SetStdIOCodePage(ErrOutput);
    SetStdIOCodePage(StdErr);
  end;
  421. {$ENDIF}
  {
    Looks up the ANSI and OEM encoding names in charset_relation: first by
    SystemLocale (column 1), then by SystemLanguage (column 0).
    On a match EncodingANSI and EncodingOEM are set from columns 2 and 3 of
    the first matching row and True is returned; otherwise False.
  }
  function FindEncoding: Boolean;

    // Scans the table for the first row whose KeyColumn equals Key
    // (case-sensitive) and stores that row's encodings
    function Lookup(KeyColumn: Integer; const Key: String): Boolean;
    var
      Row: Integer;
    begin
      for Row:= Low(charset_relation) to High(charset_relation) do
        if CompareStr(charset_relation[Row, KeyColumn], Key) = 0 then
        begin
          EncodingANSI:= charset_relation[Row, 2];
          EncodingOEM:= charset_relation[Row, 3];
          Exit(True);
        end;
      Result:= False;
    end;

  begin
    // Try language and country first, language only as the fallback;
    // the second lookup runs only when the first one finds nothing
    Result:= Lookup(1, SystemLocale) or Lookup(0, SystemLanguage);
  end;
  {
    Shared worker for the iconv based converters below: starts from a copy
    of Source and lets Iconvert rewrite it from FromEnc to ToEnc.
    The status returned by Iconvert is not inspected.
  }
  function IconvRecode(const Source, FromEnc, ToEnc: String): RawByteString;
  begin
    Result:= Source;
    Iconvert(Source, String(Result), FromEnc, ToEnc);
  end;

  { OEM -> UTF-8 }
  function Oem2Utf8(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, EncodingOEM, EncodingUTF8);
  end;

  { UTF-8 -> OEM }
  function Utf82Oem(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, EncodingUTF8, EncodingOEM);
  end;

  { OEM -> system encoding }
  function OEM2Sys(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, EncodingOEM, SystemEncoding);
  end;

  { System encoding -> OEM }
  function Sys2OEM(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, SystemEncoding, EncodingOEM);
  end;

  { ANSI -> system encoding }
  function Ansi2Sys(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, EncodingANSI, SystemEncoding);
  end;

  { System encoding -> ANSI }
  function Sys2Ansi(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, SystemEncoding, EncodingANSI);
  end;

  { ANSI -> UTF-8 }
  function Ansi2Utf8(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, EncodingANSI, EncodingUTF8);
  end;

  { UTF-8 -> ANSI }
  function Utf82Ansi(const Source: String): RawByteString;
  begin
    Result:= IconvRecode(Source, EncodingUTF8, EncodingANSI);
  end;
  {
    Unix setup: starts with pass-through converters, repairs the RTL code
    page settings when they were detected as plain ASCII, then installs the
    iconv based converters for whatever encodings could be determined.
    Finishes by printing the detected settings to standard output.
  }
  procedure Initialize;
  var
    Error: String = '';
  begin
    // Until proven otherwise no conversion is necessary
    CeOemToSys:= @Dummy;
    CeSysToOem:= @Dummy;
    CeOemToUtf8:= @Dummy;
    CeUtf8ToOem:= @Dummy;

    CeAnsiToSys:= @Dummy;
    CeSysToAnsi:= @Dummy;
    CeAnsiToUtf8:= @Dummy;
    CeUtf8ToAnsi:= @Dummy;

    CeUtf8ToSys:= @Dummy;
    CeSysToUtf8:= @Dummy;

  {$IF DEFINED(FPC_HAS_CPSTRING) and NOT DEFINED(DARWIN)}
    {
      If locale does not exists then nl_langinfo (called by cwstring unit)
      returns ANSI_X3.4-1968 (CP_ASCII) as system encoding. Try to find correct
      encoding by using environment variables LC_ALL, LC_CTYPE, LANG in this case.
    }
    if DefaultFileSystemCodePage = CP_ASCII then
    begin
      DefaultFileSystemCodePage:= UnixCP.GetSystemCodepage;
      if DefaultFileSystemCodePage <> CP_ASCII then
      begin
        // A real code page was found: make the string manager report it too
        GetWideStringManager(AManager);
        AManager.GetStandardCodePageProc:= @GetStandardCodePage;
        SetWideStringManager(AManager);
      end
      else
        // Use CP_UTF8 if cannot determine system encoding
        DefaultFileSystemCodePage:= CP_UTF8;
      SetStdIOCodePages;
      FileSystemCodePage:= DefaultFileSystemCodePage;
      DefaultSystemCodePage:= DefaultFileSystemCodePage;
      DefaultRTLFileSystemCodePage:= DefaultFileSystemCodePage;
    end;
  {$ENDIF}

    // Try to get system encoding and initialize Iconv library
    if GetSystemEncoding and InitIconv(Error) then
    begin
      SystemEncodingUtf8:= (SysUtils.CompareText(SystemEncoding, 'UTF-8') = 0) or
                           (SysUtils.CompareText(SystemEncoding, 'UTF8') = 0);
      if FindEncoding then
      begin
        // OEM converters only when an OEM encoding is known
        if (Length(EncodingOEM) > 0) then
        begin
          CeOemToSys:= @OEM2Sys;
          CeSysToOem:= @Sys2OEM;
          CeOemToUtf8:= @Oem2Utf8;
          CeUtf8ToOem:= @Utf82Oem;
        end;
        // ANSI converters only when an ANSI encoding is known
        if (Length(EncodingANSI) > 0) then
        begin
          CeAnsiToSys:= @Ansi2Sys;
          CeSysToAnsi:= @Sys2Ansi;
          CeAnsiToUtf8:= @Ansi2Utf8;
          CeUtf8ToAnsi:= @Utf82Ansi;
        end;
      end;
      // System <-> UTF-8 is a real conversion only on non UTF-8 systems
      if not SystemEncodingUtf8 then
      begin
        CeUtf8ToSys:= @UTF82Sys;
        CeSysToUtf8:= @Sys2UTF8;
      end;
    end
    else
      WriteLn(Error);

    // Report what was detected
    WriteLn('SystemLocale ', SystemLocale);
    WriteLn('SystemLanguage ', SystemLanguage);
    WriteLn('SystemEncoding ', SystemEncoding);
    WriteLn('DefaultSystemCodePage ', DefaultSystemCodePage);
    WriteLn('DefaultFileSystemCodePage ', DefaultFileSystemCodePage);
    WriteLn('DefaultRTLFileSystemCodePage ', DefaultRTLFileSystemCodePage);
  end;
  562. {$ELSE}
  { Setup for platforms that are neither Windows nor Unix: every converter
    is the pass-through Dummy. }
  procedure Initialize;
  begin
    // OEM pairs
    CeOemToSys:= @Dummy;
    CeSysToOem:= @Dummy;
    CeOemToUtf8:= @Dummy;
    CeUtf8ToOem:= @Dummy;
    // ANSI pairs
    CeAnsiToSys:= @Dummy;
    CeSysToAnsi:= @Dummy;
    CeAnsiToUtf8:= @Dummy;
    CeUtf8ToAnsi:= @Dummy;
    // System <-> UTF-8
    CeUtf8ToSys:= @Dummy;
    CeSysToUtf8:= @Dummy;
  end;
  576. {$ENDIF}
  577. initialization
  // Unit start-up: when the compiler supports code-page-aware strings
  // (FPC_HAS_CPSTRING), record the file system code page first, because the
  // Sys2UTF8 / UTF82Sys converters read FileSystemCodePage; then install the
  // platform-specific converters through Initialize.
  578. {$IF DEFINED(FPC_HAS_CPSTRING)}
  579. {$IF DEFINED(MSWINDOWS)}
  // Windows: take the code page reported by GetACP
  580. FileSystemCodePage:= Windows.GetACP;
  581. {$ELSE}
  // Elsewhere: ask the installed wide string manager for scpFileSystemSingleByte
  582. FileSystemCodePage:= WideStringManager.GetStandardCodePageProc(scpFileSystemSingleByte);
  583. {$ENDIF}
  584. {$ENDIF}
  585. Initialize;
  586. end.