| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651 |
- unit DCConvertEncoding;
- {$mode objfpc}{$H+}
- {$IF DEFINED(DARWIN)}
- {$modeswitch objectivec1}
- {$ENDIF}
- interface
- uses
- Classes, SysUtils;
- {$IF NOT DECLARED(RawByteString)}
- type
- RawByteString = AnsiString;
- {$IFEND}
- var
- {en
- Convert from OEM to System encoding, if needed
- }
- CeOemToSys: function (const Source: String): RawByteString;
- CeSysToOem: function (const Source: String): RawByteString;
- {en
- Convert from OEM to UTF-8 encoding, if needed
- }
- CeOemToUtf8: function (const Source: String): RawByteString;
- CeUtf8ToOem: function (const Source: String): RawByteString;
- {en
- Convert from Ansi to System encoding, if needed
- }
- CeAnsiToSys: function (const Source: String): RawByteString;
- CeSysToAnsi: function (const Source: String): RawByteString;
- {en
- Convert from ANSI to UTF-8 encoding, if needed
- }
- CeAnsiToUtf8: function (const Source: String): RawByteString;
- CeUtf8ToAnsi: function (const Source: String): RawByteString;
- {en
- Convert from Utf8 to System encoding, if needed
- }
- CeUtf8ToSys: function (const Source: String): RawByteString;
- CeSysToUtf8: function (const Source: String): RawByteString;
- function CeRawToUtf8(const Source: String): RawByteString;
- function CeUtf8ToUtf16(const Source: String): UnicodeString;
- function CeUtf16ToUtf8(const Source: UnicodeString): RawByteString;
- {$IF DEFINED(MSWINDOWS)}
- function CeTryEncode(const aValue: UnicodeString; aCodePage: Cardinal;
- aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
- function CeTryDecode(const aValue: AnsiString; aCodePage: Cardinal;
- out aResult: UnicodeString): Boolean;
- {$ELSEIF DEFINED(UNIX)}
- var
- SystemEncodingUtf8: Boolean = False;
- SystemEncoding, SystemLocale: String;
- {$ENDIF}
- var
- SystemLanguage: String;
- implementation
- uses
- {$IF DEFINED(UNIX)}
- LazUTF8
- {$IF DEFINED(DARWIN)}
- , dc_iconvenc_dyn, MacOSAll, CocoaAll, StrUtils
- {$ELSE}
- , iconvenc_dyn, UnixCP
- {$ENDIF}
- {$ELSEIF DEFINED(MSWINDOWS)}
- Windows
- {$ENDIF}
- ;
- {$IF DEFINED(FPC_HAS_CPSTRING)}
- var
- FileSystemCodePage: TSystemCodePage;
- {$ENDIF}
- function UTF8CharacterStrictLength(P: PAnsiChar): integer;
- begin
- if p=nil then exit(0);
- if ord(p^)<%10000000 then begin
- // regular single byte character
- exit(1);
- end
- else if ord(p^)<%11000000 then begin
- // invalid single byte character
- exit(0);
- end
- else if ((ord(p^) and %11100000) = %11000000) then begin
- // should be 2 byte character
- if (ord(p[1]) and %11000000) = %10000000 then
- exit(2)
- else
- exit(0);
- end
- else if ((ord(p^) and %11110000) = %11100000) then begin
- // should be 3 byte character
- if ((ord(p[1]) and %11000000) = %10000000)
- and ((ord(p[2]) and %11000000) = %10000000) then
- exit(3)
- else
- exit(0);
- end
- else if ((ord(p^) and %11111000) = %11110000) then begin
- // should be 4 byte character
- if ((ord(p[1]) and %11000000) = %10000000)
- and ((ord(p[2]) and %11000000) = %10000000)
- and ((ord(p[3]) and %11000000) = %10000000) then
- exit(4)
- else
- exit(0);
- end else
- exit(0);
- end;
- function CeRawToUtf8(const Source: String): RawByteString;
- var
- P: PAnsiChar;
- I, L: LongInt;
- begin
- L:= Length(Source);
- // Try UTF-8 (this includes ASCII)
- P:= PAnsiChar(Source);
- repeat
- if Ord(P^) < 128 then begin
- // ASCII
- if (P^ = #0) and (P - PAnsiChar(Source) >= L) then begin
- Result:= Source;
- Exit;
- end;
- Inc(P);
- end else begin
- I:= UTF8CharacterStrictLength(P);
- if I = 0 then Break;
- Inc(P, I);
- end;
- until False;
- Result:= CeSysToUtf8(Source);
- end;
- function CeUtf8ToUtf16(const Source: String): UnicodeString;
- {$IF DEFINED(MSWINDOWS)}
- var
- L: SizeUInt;
- begin
- L:= Length(Source);
- if L = 0 then Exit('');
- SetLength(Result, L + 1);
- // wide chars of UTF-16 <= bytes of UTF-8 string
- SetLength(Result, MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(Source), L, PWideChar(Result), L + 1));
- end;
- {$ELSE}
- var
- L: SizeUInt;
- begin
- L:= Length(Source);
- if L = 0 then Exit('');
- SetLength(Result, L + 1);
- if (ConvertUTF8ToUTF16(PUnicodeChar(Result), L + 1, PAnsiChar(Source), L,
- [toInvalidCharToSymbol], L) = trNoError) then
- begin
- SetLength(Result, L - 1);
- end
- else begin
- SetLength(Result, 0);
- end;
- end;
- {$ENDIF}
- function CeUtf16ToUtf8(const Source: UnicodeString): RawByteString;
- {$IF DEFINED(MSWINDOWS)}
- var
- L: SizeUInt;
- begin
- L:= Length(Source);
- if (L = 0) then Exit('');
- SetLength(Result, L * 3);
- // bytes of UTF-8 <= 3 * wide chars of UTF-16 string
- // e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
- SetLength(Result, WideCharToMultiByte(CP_UTF8, 0,
- PWideChar(Source), L, PAnsiChar(Result), Length(Result), nil, nil));
- end;
- {$ELSE}
- var
- L: SizeUInt;
- begin
- L:= Length(Source);
- if (L = 0) then Exit('');
- SetLength(Result, L * 3);
- if (ConvertUTF16ToUTF8(PAnsiChar(Result), Length(Result) + 1, PUnicodeChar(Source), L,
- [toInvalidCharToSymbol], L) = trNoError) then
- begin
- SetLength(Result, L - 1);
- end
- else begin
- SetLength(Result, 0);
- end;
- end;
- {$ENDIF}
- function Dummy(const Source: String): RawByteString;
- begin
- Result:= Source;
- end;
- {$IF DEFINED(FPC_HAS_CPSTRING)}
- function Sys2UTF8(const Source: String): RawByteString;
- begin
- Result:= Source;
- SetCodePage(Result, FileSystemCodePage, False);
- SetCodePage(Result, CP_UTF8, True);
- // Prevent another codepage appear in the strings
- // we don't need codepage conversion magic in our code
- SetCodePage(Result, DefaultSystemCodePage, False);
- end;
- function UTF82Sys(const Source: String): RawByteString;
- begin
- Result:= Source;
- SetCodePage(Result, CP_UTF8, False);
- SetCodePage(Result, FileSystemCodePage, True);
- // Prevent another codepage appear in the strings
- // we don't need codepage conversion magic in our code
- SetCodePage(Result, DefaultSystemCodePage, False);
- end;
- {$ELSE}
- function Sys2UTF8(const Source: String): RawByteString;
- begin
- Result:= UTF8Encode(Source);
- end;
- function UTF82Sys(const Source: String): RawByteString;
- begin
- Result:= UTF8Decode(Source);
- end;
- {$ENDIF}
- {$IF DEFINED(MSWINDOWS)}
- function CeTryEncode(const aValue: UnicodeString; aCodePage: Cardinal;
- aAllowBestFit: Boolean; out aResult: AnsiString): Boolean;
- // Try to encode the given Unicode string as the requested codepage
- const
- WC_NO_BEST_FIT_CHARS = $00000400;
- Flags: array[Boolean] of DWORD = (WC_NO_BEST_FIT_CHARS, 0);
- var
- UsedDefault: BOOL;
- begin
- if not aAllowBestFit and not CheckWin32Version(4, 1) then
- Result := False
- else begin
- SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
- PWideChar(aValue), Length(aValue), nil, 0, nil, @UsedDefault));
- SetLength(aResult, WideCharToMultiByte(aCodePage, Flags[aAllowBestFit],
- PWideChar(aValue), Length(aValue), PAnsiChar(aResult),
- Length(aResult), nil, @UsedDefault));
- Result := not UsedDefault;
- end;
- end;
- function CeTryDecode(const aValue: AnsiString; aCodePage: Cardinal;
- out aResult: UnicodeString): Boolean;
- begin
- SetLength(aResult, MultiByteToWideChar(aCodePage, MB_ERR_INVALID_CHARS,
- LPCSTR(aValue), Length(aValue), nil, 0) * SizeOf(UnicodeChar));
- SetLength(aResult, MultiByteToWideChar(aCodePage, MB_ERR_INVALID_CHARS,
- LPCSTR(aValue), Length(aValue), PWideChar(aResult), Length(aResult)));
- Result := Length(aResult) > 0;
- end;
- function Oem2Utf8(const Source: String): RawByteString;
- var
- UnicodeResult: UnicodeString;
- begin
- if CeTryDecode(Source, CP_OEMCP, UnicodeResult) then
- Result:= CeUtf16ToUtf8(UnicodeResult)
- else
- Result:= Source;
- end;
- function Utf82Oem(const Source: String): RawByteString;
- var
- AnsiResult: AnsiString;
- begin
- if CeTryEncode(CeUtf8ToUtf16(Source), CP_OEMCP, False, AnsiResult) then
- Result:= AnsiResult
- else
- Result:= Source;
- end;
- function OEM2Ansi(const Source: String): RawByteString;
- var
- Dst: PAnsiChar;
- begin
- Result:= Source;
- Dst:= AllocMem((Length(Result) + 1) * SizeOf(AnsiChar));
- if OEMToChar(PAnsiChar(Result), Dst) then
- Result:= StrPas(Dst);
- FreeMem(Dst);
- end;
- function Ansi2OEM(const Source: String): RawByteString;
- var
- Dst: PAnsiChar;
- begin
- Result := Source;
- Dst := AllocMem((Length(Result) + 1) * SizeOf(AnsiChar));
- if CharToOEM(PAnsiChar(Result), Dst) then
- Result := StrPas(Dst);
- FreeMem(Dst);
- end;
- procedure Initialize;
- var
- Buffer: array[1..4] of AnsiChar;
- begin
- CeOemToSys:= @OEM2Ansi;
- CeSysToOem:= @Ansi2OEM;
- CeOemToUtf8:= @Oem2Utf8;
- CeUtf8ToOem:= @Utf82Oem;
- CeAnsiToSys:= @Dummy;
- CeSysToAnsi:= @Dummy;
- CeAnsiToUtf8:= @Sys2UTF8;
- CeUtf8ToAnsi:= @UTF82Sys;
- CeSysToUtf8:= @Sys2UTF8;
- CeUtf8ToSys:= @UTF82Sys;
- if GetLocaleInfo(GetUserDefaultLCID, LOCALE_SABBREVLANGNAME, @Buffer[1], 4) > 0 then
- SystemLanguage := LowerCase(Copy(Buffer, 1, 2));
- end;
- {$ELSEIF DEFINED(UNIX)}
- {$I dcconvertencoding.inc}
- const
- EncodingUTF8 = 'UTF-8'; // UTF-8 Encoding
- var
- EncodingOEM, // OEM Encoding
- EncodingANSI: String; // ANSI Encoding
- function GetSystemEncoding: Boolean;
- {$IF DEFINED(DARWIN)}
- var
- Country: String;
- CurrentLocale: NSLocale;
- LanguageCFRef: CFStringRef = nil;
- LanguageCFArray: CFArrayRef = nil;
- begin
- // System encoding
- SystemEncoding:= EncodingUTF8;
- // Get system language
- LanguageCFArray:= CFLocaleCopyPreferredLanguages;
- try
- Result:= CFArrayGetCount(LanguageCFArray) > 0;
- if Result then
- begin
- LanguageCFRef:= CFArrayGetValueAtIndex(LanguageCFArray, 0);
- SetLength(SystemLanguage, MAX_PATH);
- Result:= CFStringGetCString(LanguageCFRef,
- PAnsiChar(SystemLanguage),
- MAX_PATH,
- kCFStringEncodingUTF8
- );
- end;
- finally
- CFRelease(LanguageCFArray);
- end;
- if Result then
- begin
- // Crop to terminating zero
- SystemLanguage:= PAnsiChar(SystemLanguage);
- SystemLanguage:= Copy2Symb(SystemLanguage, '-');
- // Get system country
- CurrentLocale:= NSLocale.currentLocale();
- Country:= NSString(CurrentLocale.objectForKey(NSLocaleCountryCode)).UTF8String;
- // Combine system locale
- if (Length(SystemLanguage) > 0) and (Length(Country) > 0) then
- begin
- SystemLocale:= SystemLanguage + '_' + Country;
- end;
- end;
- end;
- {$ELSE}
- var
- I: Integer;
- Lang: String;
- begin
- Result:= True;
- Lang:= SysUtils.GetEnvironmentVariable('LC_ALL');
- if Length(Lang) = 0 then
- begin
- Lang:= SysUtils.GetEnvironmentVariable('LC_CTYPE');
- if Length(Lang) = 0 then
- begin
- Lang:= SysUtils.GetEnvironmentVariable('LANG');
- if Length(Lang) = 0 then
- Exit(False);
- end;
- end;
- I:= Pos('_', Lang);
- if (I = 0) then
- SystemLanguage:= Lang
- else begin
- SystemLanguage:= Copy(Lang, 1, I - 1);
- end;
- I:= System.Pos('.', Lang);
- if (I > 0) then
- begin
- SystemLocale:= Copy(Lang, 1, I - 1);
- SystemEncoding:= Copy(Lang, I + 1, Length(Lang) - I);
- end
- else begin
- SystemLocale:= Lang;
- SystemEncoding:= EncodingUTF8;
- end;
- end;
- {$ENDIF}
- {$IF DEFINED(DARWIN)}
- function InitIconv(var Error: String): Boolean;
- begin
- Error:= EmptyStr;
- Result:= TryLoadLib('libiconv.dylib', Error);
- IconvLibFound:= IconvLibFound or Result;
- end;
- {$ELSEIF DEFINED(FPC_HAS_CPSTRING)}
- var
- AManager : TUnicodeStringManager;
- function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
- begin
- Result:= UnixCP.GetSystemCodepage;
- end;
- procedure SetStdIOCodePage(var T: Text); inline;
- begin
- case TextRec(T).Mode of
- fmInput: TextRec(T).CodePage:= GetStandardCodePage(scpConsoleInput);
- fmOutput: TextRec(T).CodePage:= GetStandardCodePage(scpConsoleOutput);
- end;
- end;
- procedure SetStdIOCodePages; inline;
- begin
- SetStdIOCodePage(Input);
- SetStdIOCodePage(Output);
- SetStdIOCodePage(ErrOutput);
- SetStdIOCodePage(StdOut);
- SetStdIOCodePage(StdErr);
- end;
- {$ENDIF}
- function FindEncoding: Boolean;
- var
- Index: Integer;
- begin
- // Try to find by language and country
- for Index:= Low(charset_relation) to High(charset_relation) do
- begin
- if CompareStr(charset_relation[Index, 1], SystemLocale) = 0 then
- begin
- EncodingANSI:= charset_relation[Index, 2];
- EncodingOEM:= charset_relation[Index, 3];
- Exit(True);
- end;
- end;
- // Try to find by language only
- for Index:= Low(charset_relation) to High(charset_relation) do
- begin
- if CompareStr(charset_relation[Index, 0], SystemLanguage) = 0 then
- begin
- EncodingANSI:= charset_relation[Index, 2];
- EncodingOEM:= charset_relation[Index, 3];
- Exit(True);
- end;
- end;
- Result:= False;
- end;
- function Oem2Utf8(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), EncodingOEM, EncodingUTF8);
- end;
- function Utf82Oem(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), EncodingUTF8, EncodingOEM);
- end;
- function OEM2Sys(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), EncodingOEM, SystemEncoding);
- end;
- function Sys2OEM(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), SystemEncoding, EncodingOEM);
- end;
- function Ansi2Sys(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), EncodingANSI, SystemEncoding);
- end;
- function Sys2Ansi(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), SystemEncoding, EncodingANSI);
- end;
- function Ansi2Utf8(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), EncodingANSI, EncodingUTF8);
- end;
- function Utf82Ansi(const Source: String): RawByteString;
- begin
- Result:= Source;
- Iconvert(Source, String(Result), EncodingUTF8, EncodingANSI);
- end;
- procedure Initialize;
- var
- Error: String = '';
- begin
- CeOemToSys:= @Dummy;
- CeSysToOem:= @Dummy;
- CeOemToUtf8:= @Dummy;
- CeUtf8ToOem:= @Dummy;
- CeAnsiToSys:= @Dummy;
- CeSysToAnsi:= @Dummy;
- CeUtf8ToSys:= @Dummy;
- CeSysToUtf8:= @Dummy;
- CeAnsiToUtf8:= @Dummy;
- CeUtf8ToAnsi:= @Dummy;
- {$IF DEFINED(FPC_HAS_CPSTRING) and NOT DEFINED(DARWIN)}
- {
- If locale does not exists then nl_langinfo (called by cwstring unit)
- returns ANSI_X3.4-1968 (CP_ASCII) as system encoding. Try to find correct
- encoding by using environment variables LC_ALL, LC_CTYPE, LANG in this case.
- }
- if DefaultFileSystemCodePage = CP_ASCII then
- begin
- DefaultFileSystemCodePage:= UnixCP.GetSystemCodepage;
- // Use CP_UTF8 if cannot determine system encoding
- if DefaultFileSystemCodePage = CP_ASCII then
- DefaultFileSystemCodePage:= CP_UTF8
- else begin
- GetWideStringManager(AManager);
- AManager.GetStandardCodePageProc:= @GetStandardCodePage;
- SetWideStringManager(AManager);
- end;
- SetStdIOCodePages;
- FileSystemCodePage:= DefaultFileSystemCodePage;
- DefaultSystemCodePage:= DefaultFileSystemCodePage;
- DefaultRTLFileSystemCodePage:= DefaultFileSystemCodePage;
- end;
- {$ENDIF}
- // Try to get system encoding and initialize Iconv library
- if not (GetSystemEncoding and InitIconv(Error)) then
- WriteLn(Error)
- else
- begin
- SystemEncodingUtf8:= (SysUtils.CompareText(SystemEncoding, 'UTF-8') = 0) or
- (SysUtils.CompareText(SystemEncoding, 'UTF8') = 0);
- if FindEncoding then
- begin
- if (Length(EncodingOEM) > 0) then
- begin
- CeOemToSys:= @OEM2Sys;
- CeSysToOem:= @Sys2OEM;
- CeOemToUtf8:= @Oem2Utf8;
- CeUtf8ToOem:= @Utf82Oem;
- end;
- if (Length(EncodingANSI) > 0) then
- begin
- CeAnsiToSys:= @Ansi2Sys;
- CeSysToAnsi:= @Sys2Ansi;
- CeAnsiToUtf8:= @Ansi2Utf8;
- CeUtf8ToAnsi:= @Utf82Ansi;
- end;
- end;
- if not SystemEncodingUtf8 then
- begin
- CeUtf8ToSys:= @UTF82Sys;
- CeSysToUtf8:= @Sys2UTF8;
- end;
- end;
- WriteLn('SystemLocale ', SystemLocale);
- WriteLn('SystemLanguage ', SystemLanguage);
- WriteLn('SystemEncoding ', SystemEncoding);
- WriteLn('DefaultSystemCodePage ', DefaultSystemCodePage);
- WriteLn('DefaultFileSystemCodePage ', DefaultFileSystemCodePage);
- WriteLn('DefaultRTLFileSystemCodePage ', DefaultRTLFileSystemCodePage);
- end;
- {$ELSE}
- procedure Initialize;
- begin
- CeOemToSys:= @Dummy;
- CeSysToOem:= @Dummy;
- CeOemToUtf8:= @Dummy;
- CeUtf8ToOem:= @Dummy;
- CeAnsiToSys:= @Dummy;
- CeSysToAnsi:= @Dummy;
- CeUtf8ToSys:= @Dummy;
- CeSysToUtf8:= @Dummy;
- CeAnsiToUtf8:= @Dummy;
- CeUtf8ToAnsi:= @Dummy;
- end;
- {$ENDIF}
- initialization
- {$IF DEFINED(FPC_HAS_CPSTRING)}
- {$IF DEFINED(MSWINDOWS)}
- FileSystemCodePage:= Windows.GetACP;
- {$ELSE}
- FileSystemCodePage:= WideStringManager.GetStandardCodePageProc(scpFileSystemSingleByte);
- {$ENDIF}
- {$ENDIF}
- Initialize;
- end.
|