cwstring.pp 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifdef dragonfly}
  30. { iconv function are included inside libc for DragonFly version 6.3 }
  31. { See https://man.dragonflybsd.org/?command=iconv&section=3 }
  32. {$ifndef DISABLE_ICONV_LIBC}
  33. {$define iconv_is_in_libc}
  34. {$endif}
  35. {$endif}
  36. { Modern FreeBSD releases also claim to have iconv
  37. function inside libc, at least
  38. this is stated explicitly in iconv(3) man
  39. of FreeBSD from 10.0 release
  40. PM 2023-05-05 }
  41. {$ifdef freebsd}
  42. {$ifndef DISABLE_ICONV_LIBC}
  43. {$define iconv_is_in_libc}
  44. {$endif}
  45. {$endif}
  46. {$ifndef iconv_is_in_libc}
  47. {$if defined(haiku)}
  48. {$linklib textencoding}
  49. {$else}
  50. {$linklib iconv}
  51. {$endif}
  52. {$define useiconv}
  53. {$endif not iconv_is_in_libc}
  54. {$i rtldefs.inc}
  55. Uses
  56. BaseUnix,
  57. ctypes,
  58. unix,
  59. unixtype,
  60. initc,
  61. dynlibs,
  62. unixcp;
  63. Const
  64. {$ifndef useiconv}
  65. libiconvname='c'; // is in libc under Linux.
  66. libprefix='lib';
  67. {$else}
  68. {$ifdef haiku}
  69. libiconvname='textencoding'; // is in libtextencoding under Haiku
  70. libprefix='lib';
  71. {$else}
  72. {$if defined(darwin) or defined(aix)}
  73. libiconvname='libiconv';
  74. libprefix='';
  75. {$else}
  76. libiconvname='iconv';
  77. libprefix='lib';
  78. {$endif}
  79. {$endif}
  80. {$endif}
  81. { helper functions from libc }
  82. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  83. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  84. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  85. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  86. {$ifdef netbsd}
  87. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  88. that should be used }
  89. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale50';
  90. {$else}
  91. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  92. {$endif}
  93. {$if not(defined(beos) and not defined(haiku))}
  94. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  95. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  96. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  97. {$else beos}
  98. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  99. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  100. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  101. {$endif beos}
  102. const
  103. {$if defined(linux)}
  104. __LC_CTYPE = 0;
  105. LC_ALL = 6;
  106. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  107. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  108. CODESET = _NL_CTYPE_CODESET_NAME;
  109. {$elseif defined(darwin)}
  110. CODESET = 0;
  111. LC_ALL = 0;
  112. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  113. __LC_CTYPE = 0;
  114. LC_ALL = 0;
  115. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  116. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  117. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  118. {$elseif defined(solaris)}
  119. {$define ACCEPT_646}
  120. CODESET=49;
  121. LC_ALL = 6;
  122. {$elseif defined(beos)}
  123. {$ifdef haiku}
  124. CODESET= 0; // Checked for Haiku
  125. LC_ALL = 0; // Checked for Haiku
  126. {$else}
  127. {$warning check correct value for BeOS}
  128. CODESET=49;
  129. LC_ALL = 6; // Checked for BeOS
  130. {$endif}
  131. ESysEILSEQ = EILSEQ;
  132. {$elseif defined(OpenBSD)}
  133. {$define ACCEPT_646}
  134. CODESET = 51;
  135. LC_ALL = 0;
  136. {$elseif defined(NetBSD)}
  137. {$define ACCEPT_646}
  138. CODESET = 51;
  139. LC_ALL = 0;
  140. {$elseif defined(aix)}
  141. CODESET = 49;
  142. LC_ALL = -1;
  143. {$elseif defined(dragonfly)}
  144. CODESET = 0;
  145. LC_ALL = 0;
  146. __LC_CTYPE = 0;
  147. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  148. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  149. {$else not aix}
  150. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  151. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  152. {$endif}
  153. { unicode encoding name }
  154. {$ifdef FPC_LITTLE_ENDIAN}
  155. unicode_encoding2 = 'UTF-16LE';
  156. unicode_encoding4 = 'UCS-4LE';
  157. {$else FPC_LITTLE_ENDIAN}
  158. {$ifdef AIX}
  159. unicode_encoding2 = 'UTF-16';
  160. {$else AIX}
  161. unicode_encoding2 = 'UTF-16BE';
  162. unicode_encoding4 = 'UCS-4BE';
  163. {$endif AIX}
  164. {$endif FPC_LITTLE_ENDIAN}
  165. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  166. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  167. { that's a libc macro mapped to internal functions/variables }
  168. { and thus not a stable external API on systems where libc }
  169. { breaks backwards compatibility every now and then }
  170. MB_CUR_MAX = 10;
  171. { Requests for iconvctl }
  172. ICONV_TRIVIALP = 0; // int *argument
  173. ICONV_GET_TRANSLITERATE = 1; // int *argument
  174. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  175. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  176. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  177. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  178. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  179. type
  180. piconv_t = ^iconv_t;
  181. iconv_t = pointer;
  182. nl_item = cint;
  183. {$ifdef haiku}
  184. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  185. {$else}
  186. {$ifndef beos}
  187. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  188. {$endif}
  189. {$endif}
  190. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  191. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  192. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  193. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  194. const
  195. iconvctlname='iconvctl';
  196. {$else}
  197. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  198. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  199. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  200. const
  201. iconvctlname='libiconvctl';
  202. {$endif}
  203. var
  204. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  205. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  206. threadvar
  207. iconv_ansi2wide,
  208. iconv_wide2ansi : iconv_t;
  209. { since we cache the iconv_t converters, we have to do the same
  210. for the DefaultSystemCodePage variable since if it changes, we
  211. have to re-initialize the converters too. We can't do that via
  212. a callback in the widestring manager because DefaultSystemCodePage
  213. is not a threadvar and we can't automatically change this in all
  214. threads }
  215. current_DefaultSystemCodePage: TSystemCodePage;
  216. procedure InitThread;
  217. var
  218. transliterate: cint;
  219. iconvindex: longint;
  220. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  221. iconvname: rawbytestring;
  222. {$endif}
  223. begin
  224. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  225. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  226. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  227. if iconvindex<>-1 then
  228. iconvname:=UnixCpMap[iconvindex].name
  229. else
  230. { default to UTF-8 on Unix platforms }
  231. iconvname:='UTF-8';
  232. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  233. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  234. {$else}
  235. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  236. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  237. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  238. {$endif}
  239. if assigned(iconvctl) and
  240. (iconv_wide2ansi<>iconv_t(-1)) then
  241. begin
  242. transliterate:=1;
  243. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  244. end;
  245. end;
  246. procedure FiniThread;
  247. begin
  248. if (iconv_wide2ansi <> iconv_t(-1)) then
  249. iconv_close(iconv_wide2ansi);
  250. if (iconv_ansi2wide <> iconv_t(-1)) then
  251. iconv_close(iconv_ansi2wide);
  252. end;
  253. {$if defined(beos) and not defined(haiku)}
  254. function nl_langinfo(__item:nl_item):pchar;
  255. begin
  256. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  257. // Now implement the minimum required to correctly initialize WideString support
  258. case __item of
  259. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  260. else
  261. begin
  262. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  263. Result := '';
  264. end;
  265. end;
  266. end;
  267. {$endif}
  268. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  269. var
  270. iconvindex: longint;
  271. begin
  272. { TODO: add caching (then we also don't need separate code for
  273. the default system page and other ones)
  274. -- typecasting an ansistring function result to pchar is
  275. unsafe normally, but these are constant strings -> no
  276. problem }
  277. open_iconv_for_cps:=iconv_t(-1);
  278. iconvindex:=GetCodepageData(cp);
  279. if iconvindex=-1 then
  280. exit;
  281. repeat
  282. if cp_is_from then
  283. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  284. else
  285. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  286. inc(iconvindex);
  287. until (open_iconv_for_cps<>iconv_t(-1)) or
  288. (iconvindex>high(UnixCpMap)) or
  289. (UnixCpMap[iconvindex].cp<>cp);
  290. end;
  291. {$ifdef aix}
  292. {$i cwstraix.inc}
  293. {$endif aix}
  294. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  295. var
  296. outlength,
  297. outoffset,
  298. srclen,
  299. outleft : size_t;
  300. use_iconv: iconv_t;
  301. srcpos : pwidechar;
  302. destpos: pchar;
  303. mynil : pchar;
  304. my0 : size_t;
  305. err : longint;
  306. transliterate: cint;
  307. free_iconv: boolean;
  308. {$ifdef aix}
  309. intermediate: rawbytestring;
  310. {$endif aix}
  311. begin
  312. {$ifdef aix}
  313. { AIX libiconv does not support converting cp866 to anything else except
  314. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  315. if cp=866 then
  316. begin
  317. Wide2AnsiMove(source,intermediate,28595,len);
  318. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  319. exit;
  320. end;
  321. {$endif aix}
  322. if (cp=DefaultSystemCodePage) then
  323. begin
  324. { update iconv converter in case the DefaultSystemCodePage has been
  325. changed }
  326. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  327. begin
  328. FiniThread;
  329. InitThread;
  330. end;
  331. use_iconv:=iconv_wide2ansi;
  332. free_iconv:=false;
  333. end
  334. else
  335. begin
  336. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  337. if (use_iconv<>iconv_t(-1)) and
  338. assigned(iconvctl) then
  339. begin
  340. transliterate:=1;
  341. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  342. end;
  343. free_iconv:=true;
  344. end;
  345. { unsupported encoding -> default move }
  346. if use_iconv=iconv_t(-1) then
  347. begin
  348. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  349. exit;
  350. end;
  351. mynil:=nil;
  352. my0:=0;
  353. { rought estimation }
  354. setlength(dest,len*3);
  355. outlength:=len*3;
  356. srclen:=len*2;
  357. srcpos:=source;
  358. destpos:=pchar(dest);
  359. outleft:=outlength;
  360. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  361. begin
  362. err:=fpgetCerrno;
  363. case err of
  364. { last character is incomplete sequence }
  365. ESysEINVAL,
  366. { incomplete sequence in the middle }
  367. ESysEILSEQ:
  368. begin
  369. { skip and set to '?' }
  370. inc(srcpos);
  371. dec(srclen,2);
  372. destpos^:='?';
  373. inc(destpos);
  374. dec(outleft);
  375. { reset }
  376. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  377. if err=ESysEINVAL then
  378. break;
  379. end;
  380. ESysE2BIG:
  381. begin
  382. outoffset:=destpos-pchar(dest);
  383. { extend }
  384. setlength(dest,outlength+len*3);
  385. inc(outleft,len*3);
  386. inc(outlength,len*3);
  387. { string could have been moved }
  388. destpos:=pchar(dest)+outoffset;
  389. end;
  390. else
  391. runerror(231);
  392. end;
  393. end;
  394. // truncate string
  395. setlength(dest,length(dest)-outleft);
  396. SetCodePage(dest,cp,false);
  397. if free_iconv then
  398. iconv_close(use_iconv);
  399. end;
  400. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  401. var
  402. outlength,
  403. outoffset,
  404. outleft : size_t;
  405. use_iconv: iconv_t;
  406. srcpos,
  407. destpos: pchar;
  408. mynil : pchar;
  409. my0 : size_t;
  410. err: cint;
  411. iconvindex: longint;
  412. free_iconv: boolean;
  413. {$ifdef aix}
  414. intermediate: rawbytestring;
  415. {$endif aix}
  416. begin
  417. {$ifdef aix}
  418. { AIX libiconv does not support converting cp866 to anything else except
  419. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  420. if cp=866 then
  421. begin
  422. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  423. source:=pchar(intermediate);
  424. end;
  425. {$endif aix}
  426. if (cp=DefaultSystemCodePage) then
  427. begin
  428. { update iconv converter in case the DefaultSystemCodePage has been
  429. changed }
  430. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  431. begin
  432. FiniThread;
  433. InitThread;
  434. end;
  435. use_iconv:=iconv_ansi2wide;
  436. free_iconv:=false;
  437. end
  438. else
  439. begin
  440. { TODO: add caching (then we also don't need separate code for
  441. the default system page and other ones)
  442. -- typecasting an ansistring function result to pchar is
  443. unsafe normally, but these are constant strings -> no
  444. problem }
  445. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  446. free_iconv:=true;
  447. end;
  448. { unsupported encoding -> default move }
  449. if use_iconv=iconv_t(-1) then
  450. begin
  451. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  452. exit;
  453. end;
  454. mynil:=nil;
  455. my0:=0;
  456. // extra space
  457. outlength:=len+1;
  458. setlength(dest,outlength);
  459. srcpos:=source;
  460. destpos:=pchar(dest);
  461. outleft:=outlength*2;
  462. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  463. begin
  464. err:=fpgetCerrno;
  465. case err of
  466. ESysEINVAL,
  467. ESysEILSEQ:
  468. begin
  469. { skip and set to '?' }
  470. inc(srcpos);
  471. dec(len);
  472. pwidechar(destpos)^:='?';
  473. inc(destpos,2);
  474. dec(outleft,2);
  475. { reset }
  476. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  477. if err=ESysEINVAL then
  478. break;
  479. end;
  480. ESysE2BIG:
  481. begin
  482. outoffset:=destpos-pchar(dest);
  483. { extend }
  484. setlength(dest,outlength+len);
  485. inc(outleft,len*2);
  486. inc(outlength,len);
  487. { string could have been moved }
  488. destpos:=pchar(dest)+outoffset;
  489. end;
  490. else
  491. runerror(231);
  492. end;
  493. end;
  494. // truncate string
  495. setlength(dest,length(dest)-outleft div 2);
  496. if free_iconv then
  497. iconv_close(use_iconv);
  498. end;
  499. function LowerWideString(const s : WideString) : WideString;
  500. var
  501. i : SizeInt;
  502. begin
  503. SetLength(result,length(s));
  504. for i:=0 to length(s)-1 do
  505. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  506. end;
  507. function UpperWideString(const s : WideString) : WideString;
  508. var
  509. i : SizeInt;
  510. begin
  511. SetLength(result,length(s));
  512. for i:=0 to length(s)-1 do
  513. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  514. end;
  515. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  516. begin
  517. if (len>length(s)) then
  518. if (length(s) < 10*256) then
  519. setlength(s,length(s)+10)
  520. else
  521. setlength(s,length(s)+length(s) shr 8);
  522. end;
  523. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  524. begin
  525. EnsureAnsiLen(s,index);
  526. pchar(@s[index])^:=c;
  527. inc(index);
  528. end;
  529. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  530. {$if not(defined(beos) and not defined(haiku))}
  531. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  532. {$else not beos}
  533. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  534. {$endif beos}
  535. var
  536. p : pchar;
  537. mblen : size_t;
  538. begin
  539. { we know that s is unique -> avoid uniquestring calls}
  540. p:=@s[index];
  541. if (nc<=127) then
  542. ConcatCharToAnsiStr(char(nc),s,index)
  543. else
  544. begin
  545. EnsureAnsiLen(s,index+MB_CUR_MAX);
  546. {$if not(defined(beos) and not defined(haiku))}
  547. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  548. {$else not beos}
  549. mblen:=wctomb(p,wchar_t(nc));
  550. {$endif not beos}
  551. if (mblen<>size_t(-1)) then
  552. inc(index,mblen)
  553. else
  554. begin
  555. { invalid wide char }
  556. p^:='?';
  557. inc(index);
  558. end;
  559. end;
  560. end;
  561. function LowerAnsiString(const s : AnsiString) : AnsiString;
  562. var
  563. i, slen,
  564. resindex : SizeInt;
  565. mblen : size_t;
  566. {$if not(defined(beos) and not defined(haiku))}
  567. ombstate,
  568. nmbstate : mbstate_t;
  569. {$endif beos}
  570. wc : wchar_t;
  571. begin
  572. {$if not(defined(beos) and not defined(haiku))}
  573. fillchar(ombstate,sizeof(ombstate),0);
  574. fillchar(nmbstate,sizeof(nmbstate),0);
  575. {$endif beos}
  576. slen:=length(s);
  577. SetLength(result,slen+10);
  578. i:=1;
  579. resindex:=1;
  580. while (i<=slen) do
  581. begin
  582. if (s[i]<=#127) then
  583. begin
  584. wc:=wchar_t(s[i]);
  585. mblen:= 1;
  586. end
  587. else
  588. {$if not(defined(beos) and not defined(haiku))}
  589. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  590. {$else not beos}
  591. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  592. {$endif not beos}
  593. case mblen of
  594. size_t(-2):
  595. begin
  596. { partial invalid character, copy literally }
  597. while (i<=slen) do
  598. begin
  599. ConcatCharToAnsiStr(s[i],result,resindex);
  600. inc(i);
  601. end;
  602. end;
  603. size_t(-1), 0:
  604. begin
  605. { invalid or null character }
  606. ConcatCharToAnsiStr(s[i],result,resindex);
  607. inc(i);
  608. end;
  609. else
  610. begin
  611. { a valid sequence }
  612. { even if mblen = 1, the lowercase version may have a }
  613. { different length }
  614. { We can't do anything special if wchar_t is 16 bit... }
  615. {$if not(defined(beos) and not defined(haiku))}
  616. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  617. {$else not beos}
  618. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  619. {$endif not beos}
  620. inc(i,mblen);
  621. end;
  622. end;
  623. end;
  624. SetLength(result,resindex-1);
  625. end;
  626. function UpperAnsiString(const s : AnsiString) : AnsiString;
  627. var
  628. i, slen,
  629. resindex : SizeInt;
  630. mblen : size_t;
  631. {$if not(defined(beos) and not defined(haiku))}
  632. ombstate,
  633. nmbstate : mbstate_t;
  634. {$endif beos}
  635. wc : wchar_t;
  636. begin
  637. {$if not(defined(beos) and not defined(haiku))}
  638. fillchar(ombstate,sizeof(ombstate),0);
  639. fillchar(nmbstate,sizeof(nmbstate),0);
  640. {$endif beos}
  641. slen:=length(s);
  642. SetLength(result,slen+10);
  643. i:=1;
  644. resindex:=1;
  645. while (i<=slen) do
  646. begin
  647. if (s[i]<=#127) then
  648. begin
  649. wc:=wchar_t(s[i]);
  650. mblen:= 1;
  651. end
  652. else
  653. {$if not(defined(beos) and not defined(haiku))}
  654. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  655. {$else not beos}
  656. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  657. {$endif beos}
  658. case mblen of
  659. size_t(-2):
  660. begin
  661. { partial invalid character, copy literally }
  662. while (i<=slen) do
  663. begin
  664. ConcatCharToAnsiStr(s[i],result,resindex);
  665. inc(i);
  666. end;
  667. end;
  668. size_t(-1), 0:
  669. begin
  670. { invalid or null character }
  671. ConcatCharToAnsiStr(s[i],result,resindex);
  672. inc(i);
  673. end;
  674. else
  675. begin
  676. { a valid sequence }
  677. { even if mblen = 1, the uppercase version may have a }
  678. { different length }
  679. { We can't do anything special if wchar_t is 16 bit... }
  680. {$if not(defined(beos) and not defined(haiku))}
  681. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  682. {$else not beos}
  683. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  684. {$endif not beos}
  685. inc(i,mblen);
  686. end;
  687. end;
  688. end;
  689. SetLength(result,resindex-1);
  690. end;
  691. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  692. var
  693. i, slen,
  694. destindex : SizeInt;
  695. uch : UCS4Char;
  696. begin
  697. slen:=length(s);
  698. setlength(result,slen+1);
  699. i:=1;
  700. destindex:=0;
  701. while (i<=slen) do
  702. begin
  703. uch:=UCS4Char(s[i]);
  704. if (uch=0) then
  705. result[destindex]:=32
  706. else if (uch<=$d7ff) or (uch>=$e000) then
  707. result[destindex]:=uch
  708. else if (uch<=$dbff) and
  709. (i<slen) and
  710. (s[i+1]>=#$dc00) and
  711. (s[i+1]<=#$dfff) then
  712. begin
  713. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  714. inc(i);
  715. end
  716. else { invalid surrogate pair }
  717. result[destindex]:=uch;
  718. inc(i);
  719. inc(destindex);
  720. end;
  721. result[destindex]:=UCS4Char(0);
  722. { Trimming length in this particular case is just a waste of time,
  723. because result will be interpreted as null-terminated and discarded
  724. almost immediately }
  725. end;
  726. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  727. {$if not(defined (aix) and defined(cpupowerpc32))}
  728. var
  729. hs1,hs2 : UCS4String;
  730. us1,us2 : WideString;
  731. begin
  732. { wcscoll interprets null chars as end-of-string -> filter out }
  733. if coIgnoreCase in Options then
  734. begin
  735. us1:=UpperWideString(s1);
  736. us2:=UpperWideString(s2);
  737. end
  738. else
  739. begin
  740. us1:=s1;
  741. us2:=s2;
  742. end;
  743. hs1:=WideStringToUCS4StringNoNulls(us1);
  744. hs2:=WideStringToUCS4StringNoNulls(us2);
  745. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  746. end;
  747. {$else}
  748. { AIX/PPC32 has a 16 bit wchar_t }
  749. var
  750. i, len: longint;
  751. us1,us2 : WideString;
  752. hs1, hs2: array of widechar;
  753. begin
  754. if coIgnoreCase in Options then
  755. begin
  756. us1:=UpperWideString(s1);
  757. us2:=UpperWideString(s2);
  758. end
  759. else
  760. begin
  761. us1:=s1;
  762. us2:=s2;
  763. end;
  764. len:=length(us1);
  765. setlength(hs1,len+1);
  766. for i:=1 to len do
  767. if us1[i]<>#0 then
  768. hs1[i-1]:=us1[i]
  769. else
  770. hs1[i-1]:=#32;
  771. hs1[len]:=#0;
  772. len:=length(us2);
  773. setlength(hs2,len+1);
  774. for i:=1 to len do
  775. if us2[i]<>#0 then
  776. hs2[i-1]:=us2[i]
  777. else
  778. hs2[i-1]:=#32;
  779. hs2[len]:=#0;
  780. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  781. end;
  782. {$endif}
  783. { return value: number of code points in the string. Whenever an invalid
  784. code point is encountered, all characters part of this invalid code point
  785. are considered to form one "character" and the next character is
  786. considered to be the start of a new (possibly also invalid) code point }
  787. function CharLengthPChar(const Str: PChar): PtrInt;
  788. var
  789. nextlen: ptrint;
  790. s: pchar;
  791. {$if not(defined(beos) and not defined(haiku))}
  792. mbstate: mbstate_t;
  793. {$endif not beos}
  794. begin
  795. result:=0;
  796. s:=str;
  797. {$if not(defined(beos) and not defined(haiku))}
  798. fillchar(mbstate,sizeof(mbstate),0);
  799. {$endif not beos}
  800. repeat
  801. {$if defined(beos) and not defined(haiku)}
  802. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  803. {$else beos}
  804. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  805. {$endif beos}
  806. { skip invalid/incomplete sequences }
  807. if (nextlen<0) then
  808. nextlen:=1;
  809. inc(result,1);
  810. inc(s,nextlen);
  811. until (nextlen=0);
  812. end;
  813. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  814. {$if not(defined(beos) and not defined(haiku))}
  815. var
  816. mbstate: mbstate_t;
  817. {$endif not beos}
  818. begin
  819. {$if defined(beos) and not defined(haiku)}
  820. result:=ptrint(mblen(str,maxlookahead));
  821. {$else beos}
  822. fillchar(mbstate,sizeof(mbstate),0);
  823. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  824. { mbrlen can also return -2 for "incomplete but potially valid character
  825. and data has been processed" }
  826. if result<0 then
  827. result:=-1;
  828. {$endif beos}
  829. end;
  830. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  831. var
  832. a,b: pchar;
  833. i: PtrInt;
  834. begin
  835. if not(canmodifys1) then
  836. getmem(a,len1+1)
  837. else
  838. a:=s1;
  839. for i:=0 to len1-1 do
  840. if s1[i]<>#0 then
  841. a[i]:=s1[i]
  842. else
  843. a[i]:=#32;
  844. a[len1]:=#0;
  845. if not(canmodifys2) then
  846. getmem(b,len2+1)
  847. else
  848. b:=s2;
  849. for i:=0 to len2-1 do
  850. if s2[i]<>#0 then
  851. b[i]:=s2[i]
  852. else
  853. b[i]:=#32;
  854. b[len2]:=#0;
  855. result:=strcoll(a,b);
  856. if not(canmodifys1) then
  857. freemem(a);
  858. if not(canmodifys2) then
  859. freemem(b);
  860. end;
  861. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  862. begin
  863. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  864. end;
  865. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  866. begin
  867. result:=strcoll(s1,s2);
  868. end;
  869. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  870. var
  871. a, b: AnsiString;
  872. begin
  873. a:=UpperAnsistring(s1);
  874. b:=UpperAnsistring(s2);
  875. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  876. end;
  877. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  878. begin
  879. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  880. end;
  881. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  882. var
  883. a, b: pchar;
  884. begin
  885. if (maxlen=0) then
  886. exit(0);
  887. if (s1[maxlen]<>#0) then
  888. begin
  889. getmem(a,maxlen+1);
  890. move(s1^,a^,maxlen);
  891. a[maxlen]:=#0;
  892. end
  893. else
  894. a:=s1;
  895. if (s2[maxlen]<>#0) then
  896. begin
  897. getmem(b,maxlen+1);
  898. move(s2^,b^,maxlen);
  899. b[maxlen]:=#0;
  900. end
  901. else
  902. b:=s2;
  903. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  904. if (a<>s1) then
  905. freemem(a);
  906. if (b<>s2) then
  907. freemem(b);
  908. end;
  909. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  910. var
  911. a, b: ansistring;
  912. begin
  913. if (maxlen=0) then
  914. exit(0);
  915. setlength(a,maxlen);
  916. move(s1^,a[1],maxlen);
  917. setlength(b,maxlen);
  918. move(s2^,b[1],maxlen);
  919. result:=AnsiCompareText(a,b);
  920. end;
  921. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  922. var
  923. newlen: sizeint;
  924. begin
  925. newlen:=length(s);
  926. if newlen>strlen(orgp) then
  927. fpc_rangeerror;
  928. p:=orgp;
  929. if (newlen>0) then
  930. move(s[1],p[0],newlen);
  931. p[newlen]:=#0;
  932. end;
  933. function AnsiStrLower(Str: PChar): PChar;
  934. var
  935. temp: ansistring;
  936. begin
  937. temp:=loweransistring(str);
  938. ansi2pchar(temp,str,result);
  939. end;
  940. function AnsiStrUpper(Str: PChar): PChar;
  941. var
  942. temp: ansistring;
  943. begin
  944. temp:=upperansistring(str);
  945. ansi2pchar(temp,str,result);
  946. end;
  947. function envvarset(const varname: pchar): boolean;
  948. var
  949. varval: pchar;
  950. begin
  951. varval:=fpgetenv(varname);
  952. result:=
  953. assigned(varval) and
  954. (varval[0]<>#0);
  955. end;
  956. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  957. var
  958. langinfo: pchar;
  959. begin
  960. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  961. if stdcp=scpFileSystemSingleByte then
  962. begin
  963. result:=CP_UTF8;
  964. exit;
  965. end;
  966. {$endif}
  967. { if none of the relevant LC_* environment variables are set, fall back to
  968. UTF-8 (this happens under some versions of OS X for GUI applications, which
  969. otherwise get CP_ASCII) }
  970. if envvarset('LC_ALL') or
  971. envvarset('LC_CTYPE') or
  972. envvarset('LANG') then
  973. begin
  974. langinfo:=nl_langinfo(CODESET);
  975. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  976. that causes it to return an empty string of UTF-8 locales
  977. -> patch up (and in general, UTF-8 is a good default on
  978. Unix platforms) }
  979. if not assigned(langinfo) or
  980. (langinfo^=#0) then
  981. langinfo:='UTF-8';
  982. Result:=GetCodepageByName(ansistring(langinfo));
  983. end
  984. else
  985. Result:=unixcp.GetSystemCodepage;
  986. end;
  987. {$ifdef FPC_HAS_CPSTRING}
  988. procedure SetStdIOCodePage(var T: Text); inline;
  989. begin
  990. case TextRec(T).Mode of
  991. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  992. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  993. end;
  994. end;
  995. procedure SetStdIOCodePages; inline;
  996. begin
  997. SetStdIOCodePage(Input);
  998. SetStdIOCodePage(Output);
  999. SetStdIOCodePage(ErrOutput);
  1000. SetStdIOCodePage(StdOut);
  1001. SetStdIOCodePage(StdErr);
  1002. end;
  1003. {$endif FPC_HAS_CPSTRING}
  1004. var
  1005. OrgWideStringManager: TUnicodeStringManager;
  1006. Procedure SetCWideStringManager;
  1007. Var
  1008. CWideStringManager : TUnicodeStringManager;
  1009. begin
  1010. GetUnicodeStringManager(OrgWideStringManager);
  1011. CWideStringManager:=OrgWideStringManager;
  1012. With CWideStringManager do
  1013. begin
  1014. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1015. Ansi2WideMoveProc:=@Ansi2WideMove;
  1016. UpperWideStringProc:=@UpperWideString;
  1017. LowerWideStringProc:=@LowerWideString;
  1018. CompareWideStringProc:=@CompareWideString;
  1019. // CompareTextWideStringProc:=@CompareTextWideString;
  1020. CharLengthPCharProc:=@CharLengthPChar;
  1021. CodePointLengthProc:=@CodePointLength;
  1022. UpperAnsiStringProc:=@UpperAnsiString;
  1023. LowerAnsiStringProc:=@LowerAnsiString;
  1024. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1025. CompareTextAnsiStringProc:=@AnsiCompareText;
  1026. StrCompAnsiStringProc:=@StrCompAnsi;
  1027. StrICompAnsiStringProc:=@AnsiStrIComp;
  1028. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1029. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1030. StrLowerAnsiStringProc:=@AnsiStrLower;
  1031. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1032. ThreadInitProc:=@InitThread;
  1033. ThreadFiniProc:=@FiniThread;
  1034. { Unicode }
  1035. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1036. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1037. UpperUnicodeStringProc:=@UpperWideString;
  1038. LowerUnicodeStringProc:=@LowerWideString;
  1039. CompareUnicodeStringProc:=@CompareWideString;
  1040. { CodePage }
  1041. GetStandardCodePageProc:=@GetStandardCodePage;
  1042. end;
  1043. SetUnicodeStringManager(CWideStringManager);
  1044. end;
  1045. var
  1046. iconvlib:TLibHandle;
  1047. initialization
  1048. SetCWideStringManager;
  1049. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1050. { with the information from the environment variables according to POSIX }
  1051. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1052. setlocale(LC_ALL,'');
  1053. { load iconvctl function }
  1054. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1055. if iconvlib<>0 then
  1056. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1057. { set the DefaultSystemCodePage }
  1058. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1059. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1060. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1061. {$ifdef FPC_HAS_CPSTRING}
  1062. SetStdIOCodePages;
  1063. {$endif FPC_HAS_CPSTRING}
  1064. { init conversion tables for main program }
  1065. InitThread;
  1066. finalization
  1067. { fini conversion tables for main program }
  1068. FiniThread;
  1069. { unload iconv library }
  1070. if iconvlib<>0 then
  1071. FreeLibrary(iconvlib);
  1072. { restore previous (probably default) widestring manager so that subsequent calls
  1073. into the widestring manager won't trigger the finalized functionality }
  1074. SetWideStringManager(OrgWideStringManager);
  1075. end.