cwstring.pp 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$modeswitch unicodestrings-}
  14. {$h-}
  15. {$inline on}
  16. {$implicitexceptions off}
  17. unit cwstring;
  18. interface
  19. procedure SetCWidestringManager;
  20. implementation
  21. {$linklib c}
  22. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  23. {$if defined(linux) or defined(solaris)}
  24. {$define iconv_is_in_libc}
  25. {$endif}
  26. {$ifdef netbsd}
  27. {$ifndef DISABLE_ICONV_LIBC}
  28. {$define iconv_is_in_libc}
  29. {$endif}
  30. {$endif}
  31. { Modern FreeBSD releases also claim to have iconv
  32. function inside libc, at least
  33. this is stated explicitly in iconv(3) man
  34. of FreeBSD from 10.0 release
  35. PM 2023-05-05 }
  36. {$ifdef freebsd}
  37. {$ifndef DISABLE_ICONV_LIBC}
  38. {$define iconv_is_in_libc}
  39. {$endif}
  40. {$endif}
  41. {$ifndef iconv_is_in_libc}
  42. {$if defined(haiku)}
  43. {$linklib textencoding}
  44. {$else}
  45. {$linklib iconv}
  46. {$endif}
  47. {$define useiconv}
  48. {$endif not iconv_is_in_libc}
  49. {$i rtldefs.inc}
  50. Uses
  51. BaseUnix,
  52. ctypes,
  53. unix,
  54. unixtype,
  55. initc,
  56. dynlibs,
  57. unixcp;
  58. Const
  59. {$ifndef useiconv}
  60. libiconvname='c'; // is in libc under Linux.
  61. libprefix='lib';
  62. {$else}
  63. {$ifdef haiku}
  64. libiconvname='textencoding'; // is in libtextencoding under Haiku
  65. libprefix='lib';
  66. {$else}
  67. {$if defined(darwin) or defined(aix)}
  68. libiconvname='libiconv';
  69. libprefix='';
  70. {$else}
  71. libiconvname='iconv';
  72. libprefix='lib';
  73. {$endif}
  74. {$endif}
  75. {$endif}
  76. { helper functions from libc }
  77. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  78. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  79. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  80. function strcoll (__s1:PAnsiChar; __s2:PAnsiChar):cint;cdecl;external clib name 'strcoll';
  81. {$ifdef netbsd}
  82. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  83. that should be used }
  84. function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name '__setlocale50';
  85. {$else}
  86. function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name 'setlocale';
  87. {$endif}
  88. {$if not(defined(beos) and not defined(haiku))}
  89. function mbrtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  90. function wcrtomb(s: PAnsiChar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  91. function mbrlen(const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  92. {$else beos}
  93. function mbtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  94. function wctomb(s: PAnsiChar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  95. function mblen(const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mblen';
  96. {$endif beos}
  97. const
  98. {$if defined(linux)}
  99. __LC_CTYPE = 0;
  100. LC_ALL = 6;
  101. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  102. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  103. CODESET = _NL_CTYPE_CODESET_NAME;
  104. {$elseif defined(darwin)}
  105. CODESET = 0;
  106. LC_ALL = 0;
  107. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  108. __LC_CTYPE = 0;
  109. LC_ALL = 0;
  110. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  111. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  112. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  113. {$elseif defined(solaris)}
  114. {$define ACCEPT_646}
  115. CODESET=49;
  116. LC_ALL = 6;
  117. {$elseif defined(beos)}
  118. {$ifdef haiku}
  119. CODESET= 0; // Checked for Haiku
  120. LC_ALL = 0; // Checked for Haiku
  121. {$else}
  122. {$warning check correct value for BeOS}
  123. CODESET=49;
  124. LC_ALL = 6; // Checked for BeOS
  125. {$endif}
  126. ESysEILSEQ = EILSEQ;
  127. {$elseif defined(OpenBSD)}
  128. {$define ACCEPT_646}
  129. CODESET = 51;
  130. LC_ALL = 0;
  131. {$elseif defined(NetBSD)}
  132. {$define ACCEPT_646}
  133. CODESET = 51;
  134. LC_ALL = 0;
  135. {$elseif defined(aix)}
  136. CODESET = 49;
  137. LC_ALL = -1;
  138. {$elseif defined(dragonfly)}
  139. CODESET = 0;
  140. LC_ALL = 0;
  141. __LC_CTYPE = 0;
  142. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  143. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  144. {$else not aix}
  145. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  146. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  147. {$endif}
  148. { unicode encoding name }
  149. {$ifdef FPC_LITTLE_ENDIAN}
  150. unicode_encoding2 = 'UTF-16LE';
  151. unicode_encoding4 = 'UCS-4LE';
  152. {$else FPC_LITTLE_ENDIAN}
  153. {$ifdef AIX}
  154. unicode_encoding2 = 'UTF-16';
  155. {$else AIX}
  156. unicode_encoding2 = 'UTF-16BE';
  157. unicode_encoding4 = 'UCS-4BE';
  158. {$endif AIX}
  159. {$endif FPC_LITTLE_ENDIAN}
  160. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  161. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  162. { that's a libc macro mapped to internal functions/variables }
  163. { and thus not a stable external API on systems where libc }
  164. { breaks backwards compatibility every now and then }
  165. MB_CUR_MAX = 10;
  166. { Requests for iconvctl }
  167. ICONV_TRIVIALP = 0; // int *argument
  168. ICONV_GET_TRANSLITERATE = 1; // int *argument
  169. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  170. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  171. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  172. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  173. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  174. type
  175. piconv_t = ^iconv_t;
  176. iconv_t = pointer;
  177. nl_item = cint;
  178. {$ifdef haiku}
  179. function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external 'root' name 'nl_langinfo';
  180. {$else}
  181. {$ifndef beos}
  182. function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external libiconvname name 'nl_langinfo';
  183. {$endif}
  184. {$endif}
  185. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  186. function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  187. function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  188. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  189. const
  190. iconvctlname='iconvctl';
  191. {$else}
  192. function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  193. function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  194. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  195. const
  196. iconvctlname='libiconvctl';
  197. {$endif}
  198. var
  199. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  200. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  201. Const
  202. UTF8Name : ShortString = 'UTF-8';
  203. TransLitname : ShortString ='//TRANSLIT';
  204. threadvar
  205. iconv_ansi2wide,
  206. iconv_wide2ansi : iconv_t;
  207. { since we cache the iconv_t converters, we have to do the same
  208. for the DefaultSystemCodePage variable since if it changes, we
  209. have to re-initialize the converters too. We can't do that via
  210. a callback in the widestring manager because DefaultSystemCodePage
  211. is not a threadvar and we can't automatically change this in all
  212. threads }
  213. current_DefaultSystemCodePage: TSystemCodePage;
  214. procedure InitThread;
  215. var
  216. transliterate: cint;
  217. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  218. iconvindex: longint;
  219. {$endif}
  220. iconvname, toencoding: shortstring;
  221. begin
  222. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  223. {$if declared(iconvindex)}
  224. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  225. if iconvindex<>-1 then
  226. iconvname:=UnixCpMap[iconvindex].name
  227. else
  228. { default to UTF-8 on Unix platforms }
  229. iconvname:=UTF8Name;
  230. {$else}
  231. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  232. iconvname:='UTF-8';
  233. {$endif}
  234. toencoding:=iconvname;
  235. if not assigned(iconvctl) then
  236. toencoding:=toencoding+TransLitName;
  237. iconv_wide2ansi:=iconv_open(PAnsiChar(@toencoding[1]),unicode_encoding2);
  238. iconv_ansi2wide:=iconv_open(unicode_encoding2,PAnsiChar(@iconvname[1]));
  239. if assigned(iconvctl) and
  240. (iconv_wide2ansi<>iconv_t(-1)) then
  241. begin
  242. transliterate:=1;
  243. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  244. end;
  245. end;
  246. procedure FiniThread;
  247. begin
  248. if (iconv_wide2ansi <> iconv_t(-1)) then
  249. iconv_close(iconv_wide2ansi);
  250. if (iconv_ansi2wide <> iconv_t(-1)) then
  251. iconv_close(iconv_ansi2wide);
  252. end;
  253. {$if defined(beos) and not defined(haiku)}
  254. function nl_langinfo(__item:nl_item):PAnsiChar;
  255. begin
  256. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  257. // Now implement the minimum required to correctly initialize WideString support
  258. case __item of
  259. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  260. else
  261. begin
  262. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  263. Result := '';
  264. end;
  265. end;
  266. end;
  267. {$endif}
  268. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: PAnsiChar; cp_is_from: boolean): iconv_t;
  269. var
  270. iconvindex: longint;
  271. toencoding: rawbytestring;
  272. transliterate: cint;
  273. begin
  274. { TODO: add caching (then we also don't need separate code for
  275. the default system page and other ones)
  276. -- typecasting an ansistring function result to PAnsiChar is
  277. unsafe normally, but these are constant strings -> no
  278. problem }
  279. open_iconv_for_cps:=iconv_t(-1);
  280. iconvindex:=GetCodepageData(cp);
  281. if iconvindex=-1 then
  282. exit;
  283. repeat
  284. if cp_is_from then
  285. open_iconv_for_cps:=iconv_open(otherencoding,PAnsiChar(UnixCpMap[iconvindex].name))
  286. else
  287. begin
  288. toencoding:=UnixCpMap[iconvindex].name;
  289. if not assigned(iconvctl) then
  290. toencoding:=toencoding+'//TRANSLIT';
  291. open_iconv_for_cps:=iconv_open(PAnsiChar(toencoding),otherencoding);
  292. end;
  293. inc(iconvindex);
  294. until (open_iconv_for_cps<>iconv_t(-1)) or
  295. (iconvindex>high(UnixCpMap)) or
  296. (UnixCpMap[iconvindex].cp<>cp);
  297. if not cp_is_from and
  298. (open_iconv_for_cps<>iconv_t(-1)) and
  299. assigned(iconvctl) then
  300. begin
  301. transliterate:=1;
  302. iconvctl(open_iconv_for_cps,ICONV_SET_TRANSLITERATE,@transliterate);
  303. end;
  304. end;
  305. {$ifdef aix}
  306. {$i cwstraix.inc}
  307. {$endif aix}
  308. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  309. var
  310. outlength,
  311. outoffset,
  312. srclen,
  313. outleft : size_t;
  314. use_iconv: iconv_t;
  315. srcpos : pwidechar;
  316. destpos: PAnsiChar;
  317. mynil : PAnsiChar;
  318. my0 : size_t;
  319. err : longint;
  320. transliterate: cint;
  321. free_iconv: boolean;
  322. {$ifdef aix}
  323. intermediate: rawbytestring;
  324. {$endif aix}
  325. begin
  326. {$ifdef aix}
  327. { AIX libiconv does not support converting cp866 to anything else except
  328. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  329. if cp=866 then
  330. begin
  331. Wide2AnsiMove(source,intermediate,28595,len);
  332. if handle_aix_intermediate(PAnsiChar(intermediate),28595,cp,dest,len) then
  333. exit;
  334. end;
  335. {$endif aix}
  336. if (cp=DefaultSystemCodePage) then
  337. begin
  338. { update iconv converter in case the DefaultSystemCodePage has been
  339. changed }
  340. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  341. begin
  342. FiniThread;
  343. InitThread;
  344. end;
  345. use_iconv:=iconv_wide2ansi;
  346. free_iconv:=false;
  347. end
  348. else
  349. begin
  350. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  351. if (use_iconv<>iconv_t(-1)) and
  352. assigned(iconvctl) then
  353. begin
  354. transliterate:=1;
  355. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  356. end;
  357. free_iconv:=true;
  358. end;
  359. { unsupported encoding -> default move }
  360. if use_iconv=iconv_t(-1) then
  361. begin
  362. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  363. exit;
  364. end;
  365. mynil:=nil;
  366. my0:=0;
  367. { rought estimation }
  368. setlength(dest,len*3);
  369. outlength:=len*3;
  370. srclen:=len*2;
  371. srcpos:=source;
  372. destpos:=PAnsiChar(dest);
  373. outleft:=outlength;
  374. while iconv(use_iconv,PPAnsiChar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  375. begin
  376. err:=fpgetCerrno;
  377. case err of
  378. { last character is incomplete sequence }
  379. ESysEINVAL,
  380. { incomplete sequence in the middle }
  381. ESysEILSEQ:
  382. begin
  383. { skip and set to '?' }
  384. inc(srcpos);
  385. dec(srclen,2);
  386. destpos^:='?';
  387. inc(destpos);
  388. dec(outleft);
  389. { reset }
  390. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  391. if err=ESysEINVAL then
  392. break;
  393. end;
  394. ESysE2BIG:
  395. begin
  396. outoffset:=destpos-PAnsiChar(dest);
  397. { extend }
  398. setlength(dest,outlength+len*3);
  399. inc(outleft,len*3);
  400. inc(outlength,len*3);
  401. { string could have been moved }
  402. destpos:=PAnsiChar(dest)+outoffset;
  403. end;
  404. else
  405. runerror(231);
  406. end;
  407. end;
  408. // truncate string
  409. setlength(dest,length(dest)-outleft);
  410. SetCodePage(dest,cp,false);
  411. if free_iconv then
  412. iconv_close(use_iconv);
  413. end;
  414. procedure Ansi2WideMove(source:PAnsiChar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  415. var
  416. outlength,
  417. outoffset,
  418. outleft : size_t;
  419. use_iconv: iconv_t;
  420. srcpos,
  421. destpos: PAnsiChar;
  422. mynil : PAnsiChar;
  423. my0 : size_t;
  424. err: cint;
  425. iconvindex: longint;
  426. free_iconv: boolean;
  427. {$ifdef aix}
  428. intermediate: rawbytestring;
  429. {$endif aix}
  430. begin
  431. {$ifdef aix}
  432. { AIX libiconv does not support converting cp866 to anything else except
  433. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  434. if cp=866 then
  435. begin
  436. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  437. source:=PAnsiChar(intermediate);
  438. end;
  439. {$endif aix}
  440. if (cp=DefaultSystemCodePage) then
  441. begin
  442. { update iconv converter in case the DefaultSystemCodePage has been
  443. changed }
  444. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  445. begin
  446. FiniThread;
  447. InitThread;
  448. end;
  449. use_iconv:=iconv_ansi2wide;
  450. free_iconv:=false;
  451. // Using Unicode RTL, we can end up here when iconv_ansi2wide is not yet initialized
  452. if (use_iconv=Nil) then
  453. begin
  454. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  455. exit;
  456. end;
  457. end
  458. else
  459. begin
  460. { TODO: add caching (then we also don't need separate code for
  461. the default system page and other ones)
  462. -- typecasting an ansistring function result to PAnsiChar is
  463. unsafe normally, but these are constant strings -> no
  464. problem }
  465. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  466. free_iconv:=true;
  467. end;
  468. { unsupported encoding -> default move }
  469. if use_iconv=iconv_t(-1) then
  470. begin
  471. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  472. exit;
  473. end;
  474. mynil:=nil;
  475. my0:=0;
  476. // extra space
  477. outlength:=len+1;
  478. setlength(dest,outlength);
  479. srcpos:=source;
  480. destpos:=PAnsiChar(dest);
  481. outleft:=outlength*2;
  482. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  483. begin
  484. err:=fpgetCerrno;
  485. case err of
  486. ESysEINVAL,
  487. ESysEILSEQ:
  488. begin
  489. { skip and set to '?' }
  490. inc(srcpos);
  491. dec(len);
  492. pwidechar(destpos)^:='?';
  493. inc(destpos,2);
  494. dec(outleft,2);
  495. { reset }
  496. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  497. if err=ESysEINVAL then
  498. break;
  499. end;
  500. ESysE2BIG:
  501. begin
  502. outoffset:=destpos-PAnsiChar(dest);
  503. { extend }
  504. setlength(dest,outlength+len);
  505. inc(outleft,len*2);
  506. inc(outlength,len);
  507. { string could have been moved }
  508. destpos:=PAnsiChar(dest)+outoffset;
  509. end;
  510. else
  511. runerror(231);
  512. end;
  513. end;
  514. // truncate string
  515. setlength(dest,length(dest)-outleft div 2);
  516. if free_iconv then
  517. iconv_close(use_iconv);
  518. end;
  519. function LowerWideString(const s : WideString) : WideString;
  520. var
  521. i : SizeInt;
  522. begin
  523. SetLength(result,length(s));
  524. for i:=0 to length(s)-1 do
  525. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  526. end;
  527. function UpperWideString(const s : WideString) : WideString;
  528. var
  529. i : SizeInt;
  530. begin
  531. SetLength(result,length(s));
  532. for i:=0 to length(s)-1 do
  533. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  534. end;
  535. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  536. begin
  537. if (len>length(s)) then
  538. if (length(s) < 10*256) then
  539. setlength(s,length(s)+10)
  540. else
  541. setlength(s,length(s)+length(s) shr 8);
  542. end;
  543. procedure ConcatCharToAnsiStr(const c: AnsiChar; var S: AnsiString; var index: SizeInt);
  544. begin
  545. EnsureAnsiLen(s,index);
  546. PAnsiChar(@s[index])^:=c;
  547. inc(index);
  548. end;
  549. { concatenates an utf-32 Char to a ansistring. S *must* be unique when entering. }
  550. {$if not(defined(beos) and not defined(haiku))}
  551. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  552. {$else not beos}
  553. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  554. {$endif beos}
  555. var
  556. p : PAnsiChar;
  557. mblen : size_t;
  558. begin
  559. { we know that s is unique -> avoid uniquestring calls}
  560. p:=@s[index];
  561. if (nc<=127) then
  562. ConcatCharToAnsiStr(AnsiChar(nc),s,index)
  563. else
  564. begin
  565. EnsureAnsiLen(s,index+MB_CUR_MAX);
  566. {$if not(defined(beos) and not defined(haiku))}
  567. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  568. {$else not beos}
  569. mblen:=wctomb(p,wchar_t(nc));
  570. {$endif not beos}
  571. if (mblen<>size_t(-1)) then
  572. inc(index,mblen)
  573. else
  574. begin
  575. { invalid wide AnsiChar }
  576. p^:='?';
  577. inc(index);
  578. end;
  579. end;
  580. end;
  581. function LowerAnsiString(const s : AnsiString) : AnsiString;
  582. var
  583. i, slen,
  584. resindex : SizeInt;
  585. mblen : size_t;
  586. {$if not(defined(beos) and not defined(haiku))}
  587. ombstate,
  588. nmbstate : mbstate_t;
  589. {$endif beos}
  590. wc : wchar_t;
  591. begin
  592. {$if not(defined(beos) and not defined(haiku))}
  593. fillchar(ombstate,sizeof(ombstate),0);
  594. fillchar(nmbstate,sizeof(nmbstate),0);
  595. {$endif beos}
  596. slen:=length(s);
  597. SetLength(result,slen+10);
  598. i:=1;
  599. resindex:=1;
  600. while (i<=slen) do
  601. begin
  602. if (s[i]<=#127) then
  603. begin
  604. wc:=wchar_t(s[i]);
  605. mblen:= 1;
  606. end
  607. else
  608. {$if not(defined(beos) and not defined(haiku))}
  609. mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
  610. {$else not beos}
  611. mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
  612. {$endif not beos}
  613. case mblen of
  614. size_t(-2):
  615. begin
  616. { partial invalid character, copy literally }
  617. while (i<=slen) do
  618. begin
  619. ConcatCharToAnsiStr(s[i],result,resindex);
  620. inc(i);
  621. end;
  622. end;
  623. size_t(-1), 0:
  624. begin
  625. { invalid or null character }
  626. ConcatCharToAnsiStr(s[i],result,resindex);
  627. inc(i);
  628. end;
  629. else
  630. begin
  631. { a valid sequence }
  632. { even if mblen = 1, the lowercase version may have a }
  633. { different length }
  634. { We can't do anything special if wchar_t is 16 bit... }
  635. {$if not(defined(beos) and not defined(haiku))}
  636. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  637. {$else not beos}
  638. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  639. {$endif not beos}
  640. inc(i,mblen);
  641. end;
  642. end;
  643. end;
  644. SetLength(result,resindex-1);
  645. end;
  646. function UpperAnsiString(const s : AnsiString) : AnsiString;
  647. var
  648. i, slen,
  649. resindex : SizeInt;
  650. mblen : size_t;
  651. {$if not(defined(beos) and not defined(haiku))}
  652. ombstate,
  653. nmbstate : mbstate_t;
  654. {$endif beos}
  655. wc : wchar_t;
  656. begin
  657. {$if not(defined(beos) and not defined(haiku))}
  658. fillchar(ombstate,sizeof(ombstate),0);
  659. fillchar(nmbstate,sizeof(nmbstate),0);
  660. {$endif beos}
  661. slen:=length(s);
  662. SetLength(result,slen+10);
  663. i:=1;
  664. resindex:=1;
  665. while (i<=slen) do
  666. begin
  667. if (s[i]<=#127) then
  668. begin
  669. wc:=wchar_t(s[i]);
  670. mblen:= 1;
  671. end
  672. else
  673. {$if not(defined(beos) and not defined(haiku))}
  674. mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
  675. {$else not beos}
  676. mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
  677. {$endif beos}
  678. case mblen of
  679. size_t(-2):
  680. begin
  681. { partial invalid character, copy literally }
  682. while (i<=slen) do
  683. begin
  684. ConcatCharToAnsiStr(s[i],result,resindex);
  685. inc(i);
  686. end;
  687. end;
  688. size_t(-1), 0:
  689. begin
  690. { invalid or null character }
  691. ConcatCharToAnsiStr(s[i],result,resindex);
  692. inc(i);
  693. end;
  694. else
  695. begin
  696. { a valid sequence }
  697. { even if mblen = 1, the uppercase version may have a }
  698. { different length }
  699. { We can't do anything special if wchar_t is 16 bit... }
  700. {$if not(defined(beos) and not defined(haiku))}
  701. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  702. {$else not beos}
  703. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  704. {$endif not beos}
  705. inc(i,mblen);
  706. end;
  707. end;
  708. end;
  709. SetLength(result,resindex-1);
  710. end;
  711. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  712. var
  713. i, slen,
  714. destindex : SizeInt;
  715. uch : UCS4Char;
  716. begin
  717. slen:=length(s);
  718. setlength(result,slen+1);
  719. i:=1;
  720. destindex:=0;
  721. while (i<=slen) do
  722. begin
  723. uch:=UCS4Char(s[i]);
  724. if (uch=0) then
  725. result[destindex]:=32
  726. else if (uch<=$d7ff) or (uch>=$e000) then
  727. result[destindex]:=uch
  728. else if (uch<=$dbff) and
  729. (i<slen) and
  730. (s[i+1]>=#$dc00) and
  731. (s[i+1]<=#$dfff) then
  732. begin
  733. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  734. inc(i);
  735. end
  736. else { invalid surrogate pair }
  737. result[destindex]:=uch;
  738. inc(i);
  739. inc(destindex);
  740. end;
  741. result[destindex]:=UCS4Char(0);
  742. { Trimming length in this particular case is just a waste of time,
  743. because result will be interpreted as null-terminated and discarded
  744. almost immediately }
  745. end;
  746. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  747. {$if not(defined (aix) and defined(cpupowerpc32))}
  748. var
  749. hs1,hs2 : UCS4String;
  750. us1,us2 : WideString;
  751. begin
  752. { wcscoll interprets null chars as end-of-string -> filter out }
  753. if coIgnoreCase in Options then
  754. begin
  755. us1:=UpperWideString(s1);
  756. us2:=UpperWideString(s2);
  757. end
  758. else
  759. begin
  760. us1:=s1;
  761. us2:=s2;
  762. end;
  763. hs1:=WideStringToUCS4StringNoNulls(us1);
  764. hs2:=WideStringToUCS4StringNoNulls(us2);
  765. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  766. end;
  767. {$else}
  768. { AIX/PPC32 has a 16 bit wchar_t }
  769. var
  770. i, len: longint;
  771. us1,us2 : WideString;
  772. hs1, hs2: array of widechar;
  773. begin
  774. if coIgnoreCase in Options then
  775. begin
  776. us1:=UpperWideString(s1);
  777. us2:=UpperWideString(s2);
  778. end
  779. else
  780. begin
  781. us1:=s1;
  782. us2:=s2;
  783. end;
  784. len:=length(us1);
  785. setlength(hs1,len+1);
  786. for i:=1 to len do
  787. if us1[i]<>#0 then
  788. hs1[i-1]:=us1[i]
  789. else
  790. hs1[i-1]:=#32;
  791. hs1[len]:=#0;
  792. len:=length(us2);
  793. setlength(hs2,len+1);
  794. for i:=1 to len do
  795. if us2[i]<>#0 then
  796. hs2[i-1]:=us2[i]
  797. else
  798. hs2[i-1]:=#32;
  799. hs2[len]:=#0;
  800. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  801. end;
  802. {$endif}
  803. { return value: number of code points in the string. Whenever an invalid
  804. code point is encountered, all characters part of this invalid code point
  805. are considered to form one "character" and the next character is
  806. considered to be the start of a new (possibly also invalid) code point }
  807. function CharLengthPChar(const Str: PAnsiChar): PtrInt;
  808. var
  809. nextlen: ptrint;
  810. s: PAnsiChar;
  811. {$if not(defined(beos) and not defined(haiku))}
  812. mbstate: mbstate_t;
  813. {$endif not beos}
  814. begin
  815. result:=0;
  816. s:=str;
  817. {$if not(defined(beos) and not defined(haiku))}
  818. fillchar(mbstate,sizeof(mbstate),0);
  819. {$endif not beos}
  820. repeat
  821. {$if defined(beos) and not defined(haiku)}
  822. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  823. {$else beos}
  824. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  825. {$endif beos}
  826. { skip invalid/incomplete sequences }
  827. if (nextlen<0) then
  828. nextlen:=1;
  829. inc(result,1);
  830. inc(s,nextlen);
  831. until (nextlen=0);
  832. end;
  833. function CodePointLength(const Str: PAnsiChar; maxlookahead: ptrint): PtrInt;
  834. {$if not(defined(beos) and not defined(haiku))}
  835. var
  836. mbstate: mbstate_t;
  837. {$endif not beos}
  838. begin
  839. {$if defined(beos) and not defined(haiku)}
  840. result:=ptrint(mblen(str,maxlookahead));
  841. {$else beos}
  842. fillchar(mbstate,sizeof(mbstate),0);
  843. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  844. { mbrlen can also return -2 for "incomplete but potially valid character
  845. and data has been processed" }
  846. if result<0 then
  847. result:=-1;
  848. {$endif beos}
  849. end;
  850. function StrCompAnsiIntern(s1,s2 : PAnsiChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  851. var
  852. a,b: PAnsiChar;
  853. i: PtrInt;
  854. begin
  855. if not(canmodifys1) then
  856. getmem(a,len1+1)
  857. else
  858. a:=s1;
  859. for i:=0 to len1-1 do
  860. if s1[i]<>#0 then
  861. a[i]:=s1[i]
  862. else
  863. a[i]:=#32;
  864. a[len1]:=#0;
  865. if not(canmodifys2) then
  866. getmem(b,len2+1)
  867. else
  868. b:=s2;
  869. for i:=0 to len2-1 do
  870. if s2[i]<>#0 then
  871. b[i]:=s2[i]
  872. else
  873. b[i]:=#32;
  874. b[len2]:=#0;
  875. result:=strcoll(a,b);
  876. if not(canmodifys1) then
  877. freemem(a);
  878. if not(canmodifys2) then
  879. freemem(b);
  880. end;
  881. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  882. begin
  883. result:=StrCompAnsiIntern(PAnsiChar(s1),PAnsiChar(s2),length(s1),length(s2),false,false);
  884. end;
  885. function StrCompAnsi(s1,s2 : PAnsiChar): PtrInt;
  886. begin
  887. result:=strcoll(s1,s2);
  888. end;
  889. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  890. var
  891. a, b: AnsiString;
  892. begin
  893. a:=UpperAnsistring(s1);
  894. b:=UpperAnsistring(s2);
  895. result:=StrCompAnsiIntern(PAnsiChar(a),PAnsiChar(b),length(a),length(b),true,true);
  896. end;
  897. function AnsiStrIComp(S1, S2: PAnsiChar): PtrInt;
  898. begin
  899. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  900. end;
  901. function AnsiStrLComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
  902. var
  903. a, b: PAnsiChar;
  904. begin
  905. if (maxlen=0) then
  906. exit(0);
  907. if (s1[maxlen]<>#0) then
  908. begin
  909. getmem(a,maxlen+1);
  910. move(s1^,a^,maxlen);
  911. a[maxlen]:=#0;
  912. end
  913. else
  914. a:=s1;
  915. if (s2[maxlen]<>#0) then
  916. begin
  917. getmem(b,maxlen+1);
  918. move(s2^,b^,maxlen);
  919. b[maxlen]:=#0;
  920. end
  921. else
  922. b:=s2;
  923. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  924. if (a<>s1) then
  925. freemem(a);
  926. if (b<>s2) then
  927. freemem(b);
  928. end;
  929. function AnsiStrLIComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
  930. var
  931. a, b: ansistring;
  932. begin
  933. if (maxlen=0) then
  934. exit(0);
  935. setlength(a,maxlen);
  936. move(s1^,a[1],maxlen);
  937. setlength(b,maxlen);
  938. move(s2^,b[1],maxlen);
  939. result:=AnsiCompareText(a,b);
  940. end;
  941. procedure ansi2pchar(const s: ansistring; const orgp: PAnsiChar; out p: PAnsiChar);
  942. var
  943. newlen: sizeint;
  944. begin
  945. newlen:=length(s);
  946. if newlen>strlen(orgp) then
  947. fpc_rangeerror;
  948. p:=orgp;
  949. if (newlen>0) then
  950. move(s[1],p[0],newlen);
  951. p[newlen]:=#0;
  952. end;
  953. function AnsiStrLower(Str: PAnsiChar): PAnsiChar;
  954. var
  955. temp: ansistring;
  956. begin
  957. temp:=loweransistring(str);
  958. ansi2pchar(temp,str,result);
  959. end;
  960. function AnsiStrUpper(Str: PAnsiChar): PAnsiChar;
  961. var
  962. temp: ansistring;
  963. begin
  964. temp:=upperansistring(str);
  965. ansi2pchar(temp,str,result);
  966. end;
  967. function envvarset(const varname: PAnsiChar): boolean;
  968. var
  969. varval: PAnsiChar;
  970. begin
  971. varval:=fpgetenv(varname);
  972. result:=
  973. assigned(varval) and
  974. (varval[0]<>#0);
  975. end;
  976. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  977. var
  978. langinfo: PAnsiChar;
  979. begin
  980. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  981. if stdcp=scpFileSystemSingleByte then
  982. begin
  983. result:=CP_UTF8;
  984. exit;
  985. end;
  986. {$endif}
  987. { if none of the relevant LC_* environment variables are set, fall back to
  988. UTF-8 (this happens under some versions of OS X for GUI applications, which
  989. otherwise get CP_ASCII) }
  990. if envvarset('LC_ALL') or
  991. envvarset('LC_CTYPE') or
  992. envvarset('LANG') then
  993. begin
  994. langinfo:=nl_langinfo(CODESET);
  995. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  996. that causes it to return an empty string of UTF-8 locales
  997. -> patch up (and in general, UTF-8 is a good default on
  998. Unix platforms) }
  999. if not assigned(langinfo) or
  1000. (langinfo^=#0) then
  1001. langinfo:='UTF-8';
  1002. Result:=GetCodepageByName(ansistring(langinfo));
  1003. end
  1004. else
  1005. Result:=unixcp.GetSystemCodepage;
  1006. end;
  1007. {$ifdef FPC_HAS_CPSTRING}
  1008. procedure SetStdIOCodePage(var T: Text); inline;
  1009. begin
  1010. case TextRec(T).Mode of
  1011. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  1012. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  1013. end;
  1014. end;
  1015. procedure SetStdIOCodePages; inline;
  1016. begin
  1017. SetStdIOCodePage(Input);
  1018. SetStdIOCodePage(Output);
  1019. SetStdIOCodePage(ErrOutput);
  1020. SetStdIOCodePage(StdOut);
  1021. SetStdIOCodePage(StdErr);
  1022. end;
  1023. {$endif FPC_HAS_CPSTRING}
  1024. var
  1025. OrgWideStringManager: TUnicodeStringManager;
  1026. Procedure SetCWideStringManager;
  1027. Var
  1028. CWideStringManager : TUnicodeStringManager;
  1029. begin
  1030. GetUnicodeStringManager(OrgWideStringManager);
  1031. CWideStringManager:=OrgWideStringManager;
  1032. With CWideStringManager do
  1033. begin
  1034. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1035. Ansi2WideMoveProc:=@Ansi2WideMove;
  1036. UpperWideStringProc:=@UpperWideString;
  1037. LowerWideStringProc:=@LowerWideString;
  1038. CompareWideStringProc:=@CompareWideString;
  1039. // CompareTextWideStringProc:=@CompareTextWideString;
  1040. CharLengthPCharProc:=@CharLengthPChar;
  1041. CodePointLengthProc:=@CodePointLength;
  1042. UpperAnsiStringProc:=@UpperAnsiString;
  1043. LowerAnsiStringProc:=@LowerAnsiString;
  1044. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1045. CompareTextAnsiStringProc:=@AnsiCompareText;
  1046. StrCompAnsiStringProc:=@StrCompAnsi;
  1047. StrICompAnsiStringProc:=@AnsiStrIComp;
  1048. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1049. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1050. StrLowerAnsiStringProc:=@AnsiStrLower;
  1051. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1052. ThreadInitProc:=@InitThread;
  1053. ThreadFiniProc:=@FiniThread;
  1054. { Unicode }
  1055. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1056. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1057. UpperUnicodeStringProc:=@UpperWideString;
  1058. LowerUnicodeStringProc:=@LowerWideString;
  1059. CompareUnicodeStringProc:=@CompareWideString;
  1060. { CodePage }
  1061. GetStandardCodePageProc:=@GetStandardCodePage;
  1062. end;
  1063. SetUnicodeStringManager(CWideStringManager);
  1064. end;
  1065. var
  1066. iconvlib:TLibHandle;
  1067. initialization
  1068. SetCWideStringManager;
  1069. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1070. { with the information from the environment variables according to POSIX }
  1071. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1072. setlocale(LC_ALL,'');
  1073. { load iconv library and iconvctl function }
  1074. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1075. if iconvlib=0 then
  1076. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
  1077. if iconvlib<>0 then
  1078. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1079. { set the DefaultSystemCodePage }
  1080. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1081. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1082. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1083. {$ifdef FPC_HAS_CPSTRING}
  1084. SetStdIOCodePages;
  1085. {$endif FPC_HAS_CPSTRING}
  1086. { init conversion tables for main program }
  1087. InitThread;
  1088. finalization
  1089. { fini conversion tables for main program }
  1090. FiniThread;
  1091. { unload iconv library }
  1092. if iconvlib<>0 then
  1093. FreeLibrary(iconvlib);
  1094. { restore previous (probably default) widestring manager so that subsequent calls
  1095. into the widestring manager won't trigger the finalized functionality }
  1096. SetWideStringManager(OrgWideStringManager);
  1097. end.