cwstring.pp 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifdef dragonfly}
  30. { iconv function are included inside libc for DragonFly version 6.3 }
  31. { See https://man.dragonflybsd.org/?command=iconv&section=3 }
  32. {$ifndef DISABLE_ICONV_LIBC}
  33. {$define iconv_is_in_libc}
  34. {$endif}
  35. {$endif}
  36. { Modern FreeBSD releases also claim to have iconv
  37. function inside libc, at least
  38. this is stated explicitly in iconv(3) man
  39. of FreeBSD from 10.0 release
  40. PM 2023-05-05 }
  41. {$ifdef freebsd}
  42. {$ifndef DISABLE_ICONV_LIBC}
  43. {$define iconv_is_in_libc}
  44. {$endif}
  45. {$endif}
  46. {$ifndef iconv_is_in_libc}
  47. {$if defined(haiku)}
  48. {$linklib textencoding}
  49. {$else}
  50. {$linklib iconv}
  51. {$endif}
  52. {$define useiconv}
  53. {$endif not iconv_is_in_libc}
  54. {$i rtldefs.inc}
  55. Uses
  56. BaseUnix,
  57. ctypes,
  58. unix,
  59. unixtype,
  60. initc,
  61. dynlibs,
  62. unixcp;
  63. Const
  64. {$ifndef useiconv}
  65. libiconvname='c'; // is in libc under Linux.
  66. libprefix='lib';
  67. {$else}
  68. {$ifdef haiku}
  69. libiconvname='textencoding'; // is in libtextencoding under Haiku
  70. libprefix='lib';
  71. {$else}
  72. {$if defined(darwin) or defined(aix)}
  73. libiconvname='libiconv';
  74. libprefix='';
  75. {$else}
  76. libiconvname='iconv';
  77. libprefix='lib';
  78. {$endif}
  79. {$endif}
  80. {$endif}
  81. { helper functions from libc }
  82. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  83. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  84. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  85. function strcoll (__s1:PAnsiChar; __s2:PAnsiChar):cint;cdecl;external clib name 'strcoll';
  86. {$ifdef netbsd}
  87. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  88. that should be used }
  89. function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name '__setlocale50';
  90. {$else}
  91. function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name 'setlocale';
  92. {$endif}
  93. {$if not(defined(beos) and not defined(haiku))}
  94. function mbrtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  95. function wcrtomb(s: PAnsiChar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  96. function mbrlen(const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  97. {$else beos}
  98. function mbtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  99. function wctomb(s: PAnsiChar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  100. function mblen(const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mblen';
  101. {$endif beos}
  102. const
  103. {$if defined(linux)}
  104. __LC_CTYPE = 0;
  105. LC_ALL = 6;
  106. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  107. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  108. CODESET = _NL_CTYPE_CODESET_NAME;
  109. {$elseif defined(darwin)}
  110. CODESET = 0;
  111. LC_ALL = 0;
  112. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  113. __LC_CTYPE = 0;
  114. LC_ALL = 0;
  115. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  116. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  117. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  118. {$elseif defined(solaris)}
  119. {$define ACCEPT_646}
  120. CODESET=49;
  121. LC_ALL = 6;
  122. {$elseif defined(beos)}
  123. {$ifdef haiku}
  124. CODESET= 0; // Checked for Haiku
  125. LC_ALL = 0; // Checked for Haiku
  126. {$else}
  127. {$warning check correct value for BeOS}
  128. CODESET=49;
  129. LC_ALL = 6; // Checked for BeOS
  130. {$endif}
  131. ESysEILSEQ = EILSEQ;
  132. {$elseif defined(OpenBSD)}
  133. {$define ACCEPT_646}
  134. CODESET = 51;
  135. LC_ALL = 0;
  136. {$elseif defined(NetBSD)}
  137. {$define ACCEPT_646}
  138. CODESET = 51;
  139. LC_ALL = 0;
  140. {$elseif defined(aix)}
  141. CODESET = 49;
  142. LC_ALL = -1;
  143. {$elseif defined(dragonfly)}
  144. CODESET = 0;
  145. LC_ALL = 0;
  146. __LC_CTYPE = 0;
  147. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  148. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  149. {$else not aix}
  150. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  151. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  152. {$endif}
  153. { unicode encoding name }
  154. {$ifdef FPC_LITTLE_ENDIAN}
  155. unicode_encoding2 = 'UTF-16LE';
  156. unicode_encoding4 = 'UCS-4LE';
  157. {$else FPC_LITTLE_ENDIAN}
  158. {$ifdef AIX}
  159. unicode_encoding2 = 'UTF-16';
  160. {$else AIX}
  161. unicode_encoding2 = 'UTF-16BE';
  162. unicode_encoding4 = 'UCS-4BE';
  163. {$endif AIX}
  164. {$endif FPC_LITTLE_ENDIAN}
  165. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  166. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  167. { that's a libc macro mapped to internal functions/variables }
  168. { and thus not a stable external API on systems where libc }
  169. { breaks backwards compatibility every now and then }
  170. MB_CUR_MAX = 10;
  171. { Requests for iconvctl }
  172. ICONV_TRIVIALP = 0; // int *argument
  173. ICONV_GET_TRANSLITERATE = 1; // int *argument
  174. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  175. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  176. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  177. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  178. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  179. type
  180. piconv_t = ^iconv_t;
  181. iconv_t = pointer;
  182. nl_item = cint;
  183. {$ifdef haiku}
  184. function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external 'root' name 'nl_langinfo';
  185. {$else}
  186. {$ifndef beos}
  187. function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external libiconvname name 'nl_langinfo';
  188. {$endif}
  189. {$endif}
  190. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  191. function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  192. function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  193. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  194. const
  195. iconvctlname='iconvctl';
  196. {$else}
  197. function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  198. function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  199. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  200. const
  201. iconvctlname='libiconvctl';
  202. {$endif}
  203. var
  204. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  205. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  206. threadvar
  207. iconv_ansi2wide,
  208. iconv_wide2ansi : iconv_t;
  209. { since we cache the iconv_t converters, we have to do the same
  210. for the DefaultSystemCodePage variable since if it changes, we
  211. have to re-initialize the converters too. We can't do that via
  212. a callback in the widestring manager because DefaultSystemCodePage
  213. is not a threadvar and we can't automatically change this in all
  214. threads }
  215. current_DefaultSystemCodePage: TSystemCodePage;
  216. procedure InitThread;
  217. var
  218. transliterate: cint;
  219. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  220. iconvindex: longint;
  221. {$endif}
  222. iconvname, toencoding: rawbytestring;
  223. begin
  224. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  225. {$if declared(iconvindex)}
  226. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  227. if iconvindex<>-1 then
  228. iconvname:=UnixCpMap[iconvindex].name
  229. else
  230. { default to UTF-8 on Unix platforms }
  231. iconvname:='UTF-8';
  232. {$else}
  233. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  234. iconvname:='UTF-8';
  235. {$endif}
  236. toencoding:=iconvname;
  237. if not assigned(iconvctl) then
  238. toencoding:=toencoding+'//TRANSLIT';
  239. iconv_wide2ansi:=iconv_open(PAnsiChar(toencoding),unicode_encoding2);
  240. iconv_ansi2wide:=iconv_open(unicode_encoding2,PAnsiChar(iconvname));
  241. if assigned(iconvctl) and
  242. (iconv_wide2ansi<>iconv_t(-1)) then
  243. begin
  244. transliterate:=1;
  245. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  246. end;
  247. end;
  248. procedure FiniThread;
  249. begin
  250. if (iconv_wide2ansi <> iconv_t(-1)) then
  251. iconv_close(iconv_wide2ansi);
  252. if (iconv_ansi2wide <> iconv_t(-1)) then
  253. iconv_close(iconv_ansi2wide);
  254. end;
  255. {$if defined(beos) and not defined(haiku)}
  256. function nl_langinfo(__item:nl_item):PAnsiChar;
  257. begin
  258. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  259. // Now implement the minimum required to correctly initialize WideString support
  260. case __item of
  261. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  262. else
  263. begin
  264. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  265. Result := '';
  266. end;
  267. end;
  268. end;
  269. {$endif}
  270. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: PAnsiChar; cp_is_from: boolean): iconv_t;
  271. var
  272. iconvindex: longint;
  273. toencoding: rawbytestring;
  274. transliterate: cint;
  275. begin
  276. { TODO: add caching (then we also don't need separate code for
  277. the default system page and other ones)
  278. -- typecasting an ansistring function result to PAnsiChar is
  279. unsafe normally, but these are constant strings -> no
  280. problem }
  281. open_iconv_for_cps:=iconv_t(-1);
  282. iconvindex:=GetCodepageData(cp);
  283. if iconvindex=-1 then
  284. exit;
  285. repeat
  286. if cp_is_from then
  287. open_iconv_for_cps:=iconv_open(otherencoding,PAnsiChar(UnixCpMap[iconvindex].name))
  288. else
  289. begin
  290. toencoding:=UnixCpMap[iconvindex].name;
  291. if not assigned(iconvctl) then
  292. toencoding:=toencoding+'//TRANSLIT';
  293. open_iconv_for_cps:=iconv_open(PAnsiChar(toencoding),otherencoding);
  294. end;
  295. inc(iconvindex);
  296. until (open_iconv_for_cps<>iconv_t(-1)) or
  297. (iconvindex>high(UnixCpMap)) or
  298. (UnixCpMap[iconvindex].cp<>cp);
  299. if not cp_is_from and
  300. (open_iconv_for_cps<>iconv_t(-1)) and
  301. assigned(iconvctl) then
  302. begin
  303. transliterate:=1;
  304. iconvctl(open_iconv_for_cps,ICONV_SET_TRANSLITERATE,@transliterate);
  305. end;
  306. end;
  307. {$ifdef aix}
  308. {$i cwstraix.inc}
  309. {$endif aix}
  310. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  311. var
  312. outlength,
  313. outoffset,
  314. srclen,
  315. outleft : size_t;
  316. use_iconv: iconv_t;
  317. srcpos : pwidechar;
  318. destpos: PAnsiChar;
  319. mynil : PAnsiChar;
  320. my0 : size_t;
  321. err : longint;
  322. transliterate: cint;
  323. free_iconv: boolean;
  324. {$ifdef aix}
  325. intermediate: rawbytestring;
  326. {$endif aix}
  327. begin
  328. {$ifdef aix}
  329. { AIX libiconv does not support converting cp866 to anything else except
  330. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  331. if cp=866 then
  332. begin
  333. Wide2AnsiMove(source,intermediate,28595,len);
  334. if handle_aix_intermediate(PAnsiChar(intermediate),28595,cp,dest,len) then
  335. exit;
  336. end;
  337. {$endif aix}
  338. if (cp=DefaultSystemCodePage) then
  339. begin
  340. { update iconv converter in case the DefaultSystemCodePage has been
  341. changed }
  342. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  343. begin
  344. FiniThread;
  345. InitThread;
  346. end;
  347. use_iconv:=iconv_wide2ansi;
  348. free_iconv:=false;
  349. end
  350. else
  351. begin
  352. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  353. if (use_iconv<>iconv_t(-1)) and
  354. assigned(iconvctl) then
  355. begin
  356. transliterate:=1;
  357. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  358. end;
  359. free_iconv:=true;
  360. end;
  361. { unsupported encoding -> default move }
  362. if use_iconv=iconv_t(-1) then
  363. begin
  364. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  365. exit;
  366. end;
  367. mynil:=nil;
  368. my0:=0;
  369. { rought estimation }
  370. setlength(dest,len*3);
  371. outlength:=len*3;
  372. srclen:=len*2;
  373. srcpos:=source;
  374. destpos:=PAnsiChar(dest);
  375. outleft:=outlength;
  376. while iconv(use_iconv,PPAnsiChar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  377. begin
  378. err:=fpgetCerrno;
  379. case err of
  380. { last character is incomplete sequence }
  381. ESysEINVAL,
  382. { incomplete sequence in the middle }
  383. ESysEILSEQ:
  384. begin
  385. { skip and set to '?' }
  386. inc(srcpos);
  387. dec(srclen,2);
  388. destpos^:='?';
  389. inc(destpos);
  390. dec(outleft);
  391. { reset }
  392. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  393. if err=ESysEINVAL then
  394. break;
  395. end;
  396. ESysE2BIG:
  397. begin
  398. outoffset:=destpos-PAnsiChar(dest);
  399. { extend }
  400. setlength(dest,outlength+len*3);
  401. inc(outleft,len*3);
  402. inc(outlength,len*3);
  403. { string could have been moved }
  404. destpos:=PAnsiChar(dest)+outoffset;
  405. end;
  406. else
  407. runerror(231);
  408. end;
  409. end;
  410. // truncate string
  411. setlength(dest,length(dest)-outleft);
  412. SetCodePage(dest,cp,false);
  413. if free_iconv then
  414. iconv_close(use_iconv);
  415. end;
  416. procedure Ansi2WideMove(source:PAnsiChar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  417. var
  418. outlength,
  419. outoffset,
  420. outleft : size_t;
  421. use_iconv: iconv_t;
  422. srcpos,
  423. destpos: PAnsiChar;
  424. mynil : PAnsiChar;
  425. my0 : size_t;
  426. err: cint;
  427. iconvindex: longint;
  428. free_iconv: boolean;
  429. {$ifdef aix}
  430. intermediate: rawbytestring;
  431. {$endif aix}
  432. begin
  433. {$ifdef aix}
  434. { AIX libiconv does not support converting cp866 to anything else except
  435. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  436. if cp=866 then
  437. begin
  438. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  439. source:=PAnsiChar(intermediate);
  440. end;
  441. {$endif aix}
  442. if (cp=DefaultSystemCodePage) then
  443. begin
  444. { update iconv converter in case the DefaultSystemCodePage has been
  445. changed }
  446. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  447. begin
  448. FiniThread;
  449. InitThread;
  450. end;
  451. use_iconv:=iconv_ansi2wide;
  452. free_iconv:=false;
  453. end
  454. else
  455. begin
  456. { TODO: add caching (then we also don't need separate code for
  457. the default system page and other ones)
  458. -- typecasting an ansistring function result to PAnsiChar is
  459. unsafe normally, but these are constant strings -> no
  460. problem }
  461. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  462. free_iconv:=true;
  463. end;
  464. { unsupported encoding -> default move }
  465. if use_iconv=iconv_t(-1) then
  466. begin
  467. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  468. exit;
  469. end;
  470. mynil:=nil;
  471. my0:=0;
  472. // extra space
  473. outlength:=len+1;
  474. setlength(dest,outlength);
  475. srcpos:=source;
  476. destpos:=PAnsiChar(dest);
  477. outleft:=outlength*2;
  478. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  479. begin
  480. err:=fpgetCerrno;
  481. case err of
  482. ESysEINVAL,
  483. ESysEILSEQ:
  484. begin
  485. { skip and set to '?' }
  486. inc(srcpos);
  487. dec(len);
  488. pwidechar(destpos)^:='?';
  489. inc(destpos,2);
  490. dec(outleft,2);
  491. { reset }
  492. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  493. if err=ESysEINVAL then
  494. break;
  495. end;
  496. ESysE2BIG:
  497. begin
  498. outoffset:=destpos-PAnsiChar(dest);
  499. { extend }
  500. setlength(dest,outlength+len);
  501. inc(outleft,len*2);
  502. inc(outlength,len);
  503. { string could have been moved }
  504. destpos:=PAnsiChar(dest)+outoffset;
  505. end;
  506. else
  507. runerror(231);
  508. end;
  509. end;
  510. // truncate string
  511. setlength(dest,length(dest)-outleft div 2);
  512. if free_iconv then
  513. iconv_close(use_iconv);
  514. end;
  515. function LowerWideString(const s : WideString) : WideString;
  516. var
  517. i : SizeInt;
  518. begin
  519. SetLength(result,length(s));
  520. for i:=0 to length(s)-1 do
  521. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  522. end;
  523. function UpperWideString(const s : WideString) : WideString;
  524. var
  525. i : SizeInt;
  526. begin
  527. SetLength(result,length(s));
  528. for i:=0 to length(s)-1 do
  529. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  530. end;
  531. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  532. begin
  533. if (len>length(s)) then
  534. if (length(s) < 10*256) then
  535. setlength(s,length(s)+10)
  536. else
  537. setlength(s,length(s)+length(s) shr 8);
  538. end;
  539. procedure ConcatCharToAnsiStr(const c: AnsiChar; var S: AnsiString; var index: SizeInt);
  540. begin
  541. EnsureAnsiLen(s,index);
  542. PAnsiChar(@s[index])^:=c;
  543. inc(index);
  544. end;
  545. { concatenates an utf-32 Char to a ansistring. S *must* be unique when entering. }
  546. {$if not(defined(beos) and not defined(haiku))}
  547. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  548. {$else not beos}
  549. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  550. {$endif beos}
  551. var
  552. p : PAnsiChar;
  553. mblen : size_t;
  554. begin
  555. { we know that s is unique -> avoid uniquestring calls}
  556. p:=@s[index];
  557. if (nc<=127) then
  558. ConcatCharToAnsiStr(AnsiChar(nc),s,index)
  559. else
  560. begin
  561. EnsureAnsiLen(s,index+MB_CUR_MAX);
  562. {$if not(defined(beos) and not defined(haiku))}
  563. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  564. {$else not beos}
  565. mblen:=wctomb(p,wchar_t(nc));
  566. {$endif not beos}
  567. if (mblen<>size_t(-1)) then
  568. inc(index,mblen)
  569. else
  570. begin
  571. { invalid wide AnsiChar }
  572. p^:='?';
  573. inc(index);
  574. end;
  575. end;
  576. end;
  577. function LowerAnsiString(const s : AnsiString) : AnsiString;
  578. var
  579. i, slen,
  580. resindex : SizeInt;
  581. mblen : size_t;
  582. {$if not(defined(beos) and not defined(haiku))}
  583. ombstate,
  584. nmbstate : mbstate_t;
  585. {$endif beos}
  586. wc : wchar_t;
  587. begin
  588. {$if not(defined(beos) and not defined(haiku))}
  589. fillchar(ombstate,sizeof(ombstate),0);
  590. fillchar(nmbstate,sizeof(nmbstate),0);
  591. {$endif beos}
  592. slen:=length(s);
  593. SetLength(result,slen+10);
  594. i:=1;
  595. resindex:=1;
  596. while (i<=slen) do
  597. begin
  598. if (s[i]<=#127) then
  599. begin
  600. wc:=wchar_t(s[i]);
  601. mblen:= 1;
  602. end
  603. else
  604. {$if not(defined(beos) and not defined(haiku))}
  605. mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
  606. {$else not beos}
  607. mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
  608. {$endif not beos}
  609. case mblen of
  610. size_t(-2):
  611. begin
  612. { partial invalid character, copy literally }
  613. while (i<=slen) do
  614. begin
  615. ConcatCharToAnsiStr(s[i],result,resindex);
  616. inc(i);
  617. end;
  618. end;
  619. size_t(-1), 0:
  620. begin
  621. { invalid or null character }
  622. ConcatCharToAnsiStr(s[i],result,resindex);
  623. inc(i);
  624. end;
  625. else
  626. begin
  627. { a valid sequence }
  628. { even if mblen = 1, the lowercase version may have a }
  629. { different length }
  630. { We can't do anything special if wchar_t is 16 bit... }
  631. {$if not(defined(beos) and not defined(haiku))}
  632. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  633. {$else not beos}
  634. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  635. {$endif not beos}
  636. inc(i,mblen);
  637. end;
  638. end;
  639. end;
  640. SetLength(result,resindex-1);
  641. end;
  642. function UpperAnsiString(const s : AnsiString) : AnsiString;
  643. var
  644. i, slen,
  645. resindex : SizeInt;
  646. mblen : size_t;
  647. {$if not(defined(beos) and not defined(haiku))}
  648. ombstate,
  649. nmbstate : mbstate_t;
  650. {$endif beos}
  651. wc : wchar_t;
  652. begin
  653. {$if not(defined(beos) and not defined(haiku))}
  654. fillchar(ombstate,sizeof(ombstate),0);
  655. fillchar(nmbstate,sizeof(nmbstate),0);
  656. {$endif beos}
  657. slen:=length(s);
  658. SetLength(result,slen+10);
  659. i:=1;
  660. resindex:=1;
  661. while (i<=slen) do
  662. begin
  663. if (s[i]<=#127) then
  664. begin
  665. wc:=wchar_t(s[i]);
  666. mblen:= 1;
  667. end
  668. else
  669. {$if not(defined(beos) and not defined(haiku))}
  670. mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
  671. {$else not beos}
  672. mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
  673. {$endif beos}
  674. case mblen of
  675. size_t(-2):
  676. begin
  677. { partial invalid character, copy literally }
  678. while (i<=slen) do
  679. begin
  680. ConcatCharToAnsiStr(s[i],result,resindex);
  681. inc(i);
  682. end;
  683. end;
  684. size_t(-1), 0:
  685. begin
  686. { invalid or null character }
  687. ConcatCharToAnsiStr(s[i],result,resindex);
  688. inc(i);
  689. end;
  690. else
  691. begin
  692. { a valid sequence }
  693. { even if mblen = 1, the uppercase version may have a }
  694. { different length }
  695. { We can't do anything special if wchar_t is 16 bit... }
  696. {$if not(defined(beos) and not defined(haiku))}
  697. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  698. {$else not beos}
  699. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  700. {$endif not beos}
  701. inc(i,mblen);
  702. end;
  703. end;
  704. end;
  705. SetLength(result,resindex-1);
  706. end;
  707. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  708. var
  709. i, slen,
  710. destindex : SizeInt;
  711. uch : UCS4Char;
  712. begin
  713. slen:=length(s);
  714. setlength(result,slen+1);
  715. i:=1;
  716. destindex:=0;
  717. while (i<=slen) do
  718. begin
  719. uch:=UCS4Char(s[i]);
  720. if (uch=0) then
  721. result[destindex]:=32
  722. else if (uch<=$d7ff) or (uch>=$e000) then
  723. result[destindex]:=uch
  724. else if (uch<=$dbff) and
  725. (i<slen) and
  726. (s[i+1]>=#$dc00) and
  727. (s[i+1]<=#$dfff) then
  728. begin
  729. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  730. inc(i);
  731. end
  732. else { invalid surrogate pair }
  733. result[destindex]:=uch;
  734. inc(i);
  735. inc(destindex);
  736. end;
  737. result[destindex]:=UCS4Char(0);
  738. { Trimming length in this particular case is just a waste of time,
  739. because result will be interpreted as null-terminated and discarded
  740. almost immediately }
  741. end;
  742. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  743. {$if not(defined (aix) and defined(cpupowerpc32))}
  744. var
  745. hs1,hs2 : UCS4String;
  746. us1,us2 : WideString;
  747. begin
  748. { wcscoll interprets null chars as end-of-string -> filter out }
  749. if coIgnoreCase in Options then
  750. begin
  751. us1:=UpperWideString(s1);
  752. us2:=UpperWideString(s2);
  753. end
  754. else
  755. begin
  756. us1:=s1;
  757. us2:=s2;
  758. end;
  759. hs1:=WideStringToUCS4StringNoNulls(us1);
  760. hs2:=WideStringToUCS4StringNoNulls(us2);
  761. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  762. end;
  763. {$else}
  764. { AIX/PPC32 has a 16 bit wchar_t }
  765. var
  766. i, len: longint;
  767. us1,us2 : WideString;
  768. hs1, hs2: array of widechar;
  769. begin
  770. if coIgnoreCase in Options then
  771. begin
  772. us1:=UpperWideString(s1);
  773. us2:=UpperWideString(s2);
  774. end
  775. else
  776. begin
  777. us1:=s1;
  778. us2:=s2;
  779. end;
  780. len:=length(us1);
  781. setlength(hs1,len+1);
  782. for i:=1 to len do
  783. if us1[i]<>#0 then
  784. hs1[i-1]:=us1[i]
  785. else
  786. hs1[i-1]:=#32;
  787. hs1[len]:=#0;
  788. len:=length(us2);
  789. setlength(hs2,len+1);
  790. for i:=1 to len do
  791. if us2[i]<>#0 then
  792. hs2[i-1]:=us2[i]
  793. else
  794. hs2[i-1]:=#32;
  795. hs2[len]:=#0;
  796. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  797. end;
  798. {$endif}
  799. { return value: number of code points in the string. Whenever an invalid
  800. code point is encountered, all characters part of this invalid code point
  801. are considered to form one "character" and the next character is
  802. considered to be the start of a new (possibly also invalid) code point }
  803. function CharLengthPChar(const Str: PAnsiChar): PtrInt;
  804. var
  805. nextlen: ptrint;
  806. s: PAnsiChar;
  807. {$if not(defined(beos) and not defined(haiku))}
  808. mbstate: mbstate_t;
  809. {$endif not beos}
  810. begin
  811. result:=0;
  812. s:=str;
  813. {$if not(defined(beos) and not defined(haiku))}
  814. fillchar(mbstate,sizeof(mbstate),0);
  815. {$endif not beos}
  816. repeat
  817. {$if defined(beos) and not defined(haiku)}
  818. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  819. {$else beos}
  820. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  821. {$endif beos}
  822. { skip invalid/incomplete sequences }
  823. if (nextlen<0) then
  824. nextlen:=1;
  825. inc(result,1);
  826. inc(s,nextlen);
  827. until (nextlen=0);
  828. end;
  829. function CodePointLength(const Str: PAnsiChar; maxlookahead: ptrint): PtrInt;
  830. {$if not(defined(beos) and not defined(haiku))}
  831. var
  832. mbstate: mbstate_t;
  833. {$endif not beos}
  834. begin
  835. {$if defined(beos) and not defined(haiku)}
  836. result:=ptrint(mblen(str,maxlookahead));
  837. {$else beos}
  838. fillchar(mbstate,sizeof(mbstate),0);
  839. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  840. { mbrlen can also return -2 for "incomplete but potially valid character
  841. and data has been processed" }
  842. if result<0 then
  843. result:=-1;
  844. {$endif beos}
  845. end;
  846. function StrCompAnsiIntern(s1,s2 : PAnsiChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  847. var
  848. a,b: PAnsiChar;
  849. i: PtrInt;
  850. begin
  851. if not(canmodifys1) then
  852. getmem(a,len1+1)
  853. else
  854. a:=s1;
  855. for i:=0 to len1-1 do
  856. if s1[i]<>#0 then
  857. a[i]:=s1[i]
  858. else
  859. a[i]:=#32;
  860. a[len1]:=#0;
  861. if not(canmodifys2) then
  862. getmem(b,len2+1)
  863. else
  864. b:=s2;
  865. for i:=0 to len2-1 do
  866. if s2[i]<>#0 then
  867. b[i]:=s2[i]
  868. else
  869. b[i]:=#32;
  870. b[len2]:=#0;
  871. result:=strcoll(a,b);
  872. if not(canmodifys1) then
  873. freemem(a);
  874. if not(canmodifys2) then
  875. freemem(b);
  876. end;
  877. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  878. begin
  879. result:=StrCompAnsiIntern(PAnsiChar(s1),PAnsiChar(s2),length(s1),length(s2),false,false);
  880. end;
  881. function StrCompAnsi(s1,s2 : PAnsiChar): PtrInt;
  882. begin
  883. result:=strcoll(s1,s2);
  884. end;
  885. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  886. var
  887. a, b: AnsiString;
  888. begin
  889. a:=UpperAnsistring(s1);
  890. b:=UpperAnsistring(s2);
  891. result:=StrCompAnsiIntern(PAnsiChar(a),PAnsiChar(b),length(a),length(b),true,true);
  892. end;
  893. function AnsiStrIComp(S1, S2: PAnsiChar): PtrInt;
  894. begin
  895. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  896. end;
  897. function AnsiStrLComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
  898. var
  899. a, b: PAnsiChar;
  900. begin
  901. if (maxlen=0) then
  902. exit(0);
  903. if (s1[maxlen]<>#0) then
  904. begin
  905. getmem(a,maxlen+1);
  906. move(s1^,a^,maxlen);
  907. a[maxlen]:=#0;
  908. end
  909. else
  910. a:=s1;
  911. if (s2[maxlen]<>#0) then
  912. begin
  913. getmem(b,maxlen+1);
  914. move(s2^,b^,maxlen);
  915. b[maxlen]:=#0;
  916. end
  917. else
  918. b:=s2;
  919. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  920. if (a<>s1) then
  921. freemem(a);
  922. if (b<>s2) then
  923. freemem(b);
  924. end;
  925. function AnsiStrLIComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
  926. var
  927. a, b: ansistring;
  928. begin
  929. if (maxlen=0) then
  930. exit(0);
  931. setlength(a,maxlen);
  932. move(s1^,a[1],maxlen);
  933. setlength(b,maxlen);
  934. move(s2^,b[1],maxlen);
  935. result:=AnsiCompareText(a,b);
  936. end;
  937. procedure ansi2pchar(const s: ansistring; const orgp: PAnsiChar; out p: PAnsiChar);
  938. var
  939. newlen: sizeint;
  940. begin
  941. newlen:=length(s);
  942. if newlen>strlen(orgp) then
  943. fpc_rangeerror;
  944. p:=orgp;
  945. if (newlen>0) then
  946. move(s[1],p[0],newlen);
  947. p[newlen]:=#0;
  948. end;
  949. function AnsiStrLower(Str: PAnsiChar): PAnsiChar;
  950. var
  951. temp: ansistring;
  952. begin
  953. temp:=loweransistring(str);
  954. ansi2pchar(temp,str,result);
  955. end;
  956. function AnsiStrUpper(Str: PAnsiChar): PAnsiChar;
  957. var
  958. temp: ansistring;
  959. begin
  960. temp:=upperansistring(str);
  961. ansi2pchar(temp,str,result);
  962. end;
  963. function envvarset(const varname: PAnsiChar): boolean;
  964. var
  965. varval: PAnsiChar;
  966. begin
  967. varval:=fpgetenv(varname);
  968. result:=
  969. assigned(varval) and
  970. (varval[0]<>#0);
  971. end;
  972. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  973. var
  974. langinfo: PAnsiChar;
  975. begin
  976. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  977. if stdcp=scpFileSystemSingleByte then
  978. begin
  979. result:=CP_UTF8;
  980. exit;
  981. end;
  982. {$endif}
  983. { if none of the relevant LC_* environment variables are set, fall back to
  984. UTF-8 (this happens under some versions of OS X for GUI applications, which
  985. otherwise get CP_ASCII) }
  986. if envvarset('LC_ALL') or
  987. envvarset('LC_CTYPE') or
  988. envvarset('LANG') then
  989. begin
  990. langinfo:=nl_langinfo(CODESET);
  991. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  992. that causes it to return an empty string of UTF-8 locales
  993. -> patch up (and in general, UTF-8 is a good default on
  994. Unix platforms) }
  995. if not assigned(langinfo) or
  996. (langinfo^=#0) then
  997. langinfo:='UTF-8';
  998. Result:=GetCodepageByName(ansistring(langinfo));
  999. end
  1000. else
  1001. Result:=unixcp.GetSystemCodepage;
  1002. end;
  1003. {$ifdef FPC_HAS_CPSTRING}
  1004. procedure SetStdIOCodePage(var T: Text); inline;
  1005. begin
  1006. case TextRec(T).Mode of
  1007. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  1008. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  1009. end;
  1010. end;
  1011. procedure SetStdIOCodePages; inline;
  1012. begin
  1013. SetStdIOCodePage(Input);
  1014. SetStdIOCodePage(Output);
  1015. SetStdIOCodePage(ErrOutput);
  1016. SetStdIOCodePage(StdOut);
  1017. SetStdIOCodePage(StdErr);
  1018. end;
  1019. {$endif FPC_HAS_CPSTRING}
  1020. var
  1021. OrgWideStringManager: TUnicodeStringManager;
  1022. Procedure SetCWideStringManager;
  1023. Var
  1024. CWideStringManager : TUnicodeStringManager;
  1025. begin
  1026. GetUnicodeStringManager(OrgWideStringManager);
  1027. CWideStringManager:=OrgWideStringManager;
  1028. With CWideStringManager do
  1029. begin
  1030. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1031. Ansi2WideMoveProc:=@Ansi2WideMove;
  1032. UpperWideStringProc:=@UpperWideString;
  1033. LowerWideStringProc:=@LowerWideString;
  1034. CompareWideStringProc:=@CompareWideString;
  1035. // CompareTextWideStringProc:=@CompareTextWideString;
  1036. CharLengthPCharProc:=@CharLengthPChar;
  1037. CodePointLengthProc:=@CodePointLength;
  1038. UpperAnsiStringProc:=@UpperAnsiString;
  1039. LowerAnsiStringProc:=@LowerAnsiString;
  1040. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1041. CompareTextAnsiStringProc:=@AnsiCompareText;
  1042. StrCompAnsiStringProc:=@StrCompAnsi;
  1043. StrICompAnsiStringProc:=@AnsiStrIComp;
  1044. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1045. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1046. StrLowerAnsiStringProc:=@AnsiStrLower;
  1047. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1048. ThreadInitProc:=@InitThread;
  1049. ThreadFiniProc:=@FiniThread;
  1050. { Unicode }
  1051. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1052. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1053. UpperUnicodeStringProc:=@UpperWideString;
  1054. LowerUnicodeStringProc:=@LowerWideString;
  1055. CompareUnicodeStringProc:=@CompareWideString;
  1056. { CodePage }
  1057. GetStandardCodePageProc:=@GetStandardCodePage;
  1058. end;
  1059. SetUnicodeStringManager(CWideStringManager);
  1060. end;
  1061. var
  1062. iconvlib:TLibHandle;
  1063. initialization
  1064. SetCWideStringManager;
  1065. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1066. { with the information from the environment variables according to POSIX }
  1067. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1068. setlocale(LC_ALL,'');
  1069. { load iconv library and iconvctl function }
  1070. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1071. if iconvlib=0 then
  1072. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
  1073. if iconvlib<>0 then
  1074. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1075. { set the DefaultSystemCodePage }
  1076. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1077. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1078. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1079. {$ifdef FPC_HAS_CPSTRING}
  1080. SetStdIOCodePages;
  1081. {$endif FPC_HAS_CPSTRING}
  1082. { init conversion tables for main program }
  1083. InitThread;
  1084. finalization
  1085. { fini conversion tables for main program }
  1086. FiniThread;
  1087. { unload iconv library }
  1088. if iconvlib<>0 then
  1089. FreeLibrary(iconvlib);
  1090. { restore previous (probably default) widestring manager so that subsequent calls
  1091. into the widestring manager won't trigger the finalized functionality }
  1092. SetWideStringManager(OrgWideStringManager);
  1093. end.