cwstring.pp 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifdef TEST_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifndef iconv_is_in_libc}
  30. {$if defined(haiku)}
  31. {$linklib textencoding}
  32. {$else}
  33. {$linklib iconv}
  34. {$endif}
  35. {$define useiconv}
  36. {$endif not iconv_is_in_libc}
  37. {$i rtldefs.inc}
  38. Uses
  39. BaseUnix,
  40. ctypes,
  41. unix,
  42. unixtype,
  43. initc,
  44. dynlibs,
  45. unixcp;
  46. Const
  47. {$ifndef useiconv}
  48. libiconvname='c'; // is in libc under Linux.
  49. libprefix='lib';
  50. {$else}
  51. {$ifdef haiku}
  52. libiconvname='textencoding'; // is in libtextencoding under Haiku
  53. libprefix='lib';
  54. {$else}
  55. {$if defined(darwin) or defined(aix)}
  56. libiconvname='libiconv';
  57. libprefix='';
  58. {$else}
  59. libiconvname='iconv';
  60. libprefix='lib';
  61. {$endif}
  62. {$endif}
  63. {$endif}
  64. { helper functions from libc }
  65. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  66. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  67. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  68. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  69. {$ifdef netbsd}
  70. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  71. that should be used }
  72. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale_mb_len_max_32';
  73. {$else}
  74. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  75. {$endif}
  76. {$ifndef beos}
  77. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  78. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  79. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  80. {$else beos}
  81. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  82. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  83. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  84. {$endif beos}
  85. const
  86. {$if defined(linux)}
  87. __LC_CTYPE = 0;
  88. LC_ALL = 6;
  89. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  90. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  91. CODESET = _NL_CTYPE_CODESET_NAME;
  92. {$elseif defined(darwin)}
  93. CODESET = 0;
  94. LC_ALL = 0;
  95. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  96. __LC_CTYPE = 0;
  97. LC_ALL = 0;
  98. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  99. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  100. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  101. {$elseif defined(solaris)}
  102. {$define ACCEPT_646}
  103. CODESET=49;
  104. LC_ALL = 6;
  105. {$elseif defined(beos)}
  106. {$warning check correct value for BeOS}
  107. CODESET=49;
  108. {$ifdef haiku}
  109. LC_ALL = 0; // Checked for Haiku
  110. {$else}
  111. LC_ALL = 6; // Checked for BeOS
  112. {$endif}
  113. ESysEILSEQ = EILSEQ;
  114. {$elseif defined(OpenBSD)}
  115. {$define ACCEPT_646}
  116. CODESET = 51;
  117. LC_ALL = 0;
  118. {$elseif defined(NetBSD)}
  119. {$define ACCEPT_646}
  120. CODESET = 51;
  121. LC_ALL = 0;
  122. {$elseif defined(aix)}
  123. CODESET = 49;
  124. LC_ALL = -1;
  125. {$else not aix}
  126. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  127. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  128. {$endif}
  129. { unicode encoding name }
  130. {$ifdef FPC_LITTLE_ENDIAN}
  131. unicode_encoding2 = 'UTF-16LE';
  132. unicode_encoding4 = 'UCS-4LE';
  133. {$else FPC_LITTLE_ENDIAN}
  134. {$ifdef AIX}
  135. unicode_encoding2 = 'UTF-16';
  136. {$else AIX}
  137. unicode_encoding2 = 'UTF-16BE';
  138. unicode_encoding4 = 'UCS-4BE';
  139. {$endif AIX}
  140. {$endif FPC_LITTLE_ENDIAN}
  141. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  142. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  143. { that's a libc macro mapped to internal functions/variables }
  144. { and thus not a stable external API on systems where libc }
  145. { breaks backwards compatibility every now and then }
  146. MB_CUR_MAX = 10;
  147. { Requests for iconvctl }
  148. ICONV_TRIVIALP = 0; // int *argument
  149. ICONV_GET_TRANSLITERATE = 1; // int *argument
  150. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  151. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  152. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  153. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  154. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  155. type
  156. piconv_t = ^iconv_t;
  157. iconv_t = pointer;
  158. nl_item = cint;
  159. {$ifdef haiku}
  160. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  161. {$else}
  162. {$ifndef beos}
  163. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  164. {$endif}
  165. {$endif}
  166. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  167. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  168. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  169. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  170. const
  171. iconvctlname='iconvctl';
  172. {$else}
  173. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  174. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  175. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  176. const
  177. iconvctlname='libiconvctl';
  178. {$endif}
  179. var
  180. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  181. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  182. threadvar
  183. iconv_ansi2wide,
  184. iconv_wide2ansi : iconv_t;
  185. { since we cache the iconv_t converters, we have to do the same
  186. for the DefaultSystemCodePage variable since if it changes, we
  187. have to re-initialize the converters too. We can't do that via
  188. a callback in the widestring manager because DefaultSystemCodePage
  189. is not a threadvar and we can't automatically change this in all
  190. threads }
  191. current_DefaultSystemCodePage: TSystemCodePage;
  192. procedure InitThread;
  193. var
  194. transliterate: cint;
  195. iconvindex: longint;
  196. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  197. iconvname: rawbytestring;
  198. {$endif}
  199. begin
  200. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  201. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  202. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  203. if iconvindex<>-1 then
  204. iconvname:=UnixCpMap[iconvindex].name
  205. else
  206. { default to UTF-8 on Unix platforms }
  207. iconvname:='UTF-8';
  208. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  209. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  210. {$else}
  211. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  212. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  213. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  214. {$endif}
  215. if assigned(iconvctl) and
  216. (iconv_wide2ansi<>iconv_t(-1)) then
  217. begin
  218. transliterate:=1;
  219. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  220. end;
  221. end;
  222. procedure FiniThread;
  223. begin
  224. if (iconv_wide2ansi <> iconv_t(-1)) then
  225. iconv_close(iconv_wide2ansi);
  226. if (iconv_ansi2wide <> iconv_t(-1)) then
  227. iconv_close(iconv_ansi2wide);
  228. end;
  229. {$if defined(beos) and not defined(haiku)}
  230. function nl_langinfo(__item:nl_item):pchar;
  231. begin
  232. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  233. // Now implement the minimum required to correctly initialize WideString support
  234. case __item of
  235. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  236. else
  237. begin
  238. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  239. Result := '';
  240. end;
  241. end;
  242. end;
  243. {$endif}
  244. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  245. var
  246. iconvindex: longint;
  247. begin
  248. { TODO: add caching (then we also don't need separate code for
  249. the default system page and other ones)
  250. -- typecasting an ansistring function result to pchar is
  251. unsafe normally, but these are constant strings -> no
  252. problem }
  253. open_iconv_for_cps:=iconv_t(-1);
  254. iconvindex:=GetCodepageData(cp);
  255. if iconvindex=-1 then
  256. exit;
  257. repeat
  258. if cp_is_from then
  259. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  260. else
  261. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  262. inc(iconvindex);
  263. until (open_iconv_for_cps<>iconv_t(-1)) or
  264. (iconvindex>high(UnixCpMap)) or
  265. (UnixCpMap[iconvindex].cp<>cp);
  266. end;
  267. {$ifdef aix}
  268. {$i cwstraix.inc}
  269. {$endif aix}
  270. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  271. var
  272. outlength,
  273. outoffset,
  274. srclen,
  275. outleft : size_t;
  276. use_iconv: iconv_t;
  277. srcpos : pwidechar;
  278. destpos: pchar;
  279. mynil : pchar;
  280. my0 : size_t;
  281. err : longint;
  282. transliterate: cint;
  283. free_iconv: boolean;
  284. {$ifdef aix}
  285. intermediate: rawbytestring;
  286. {$endif aix}
  287. begin
  288. {$ifdef aix}
  289. { AIX libiconv does not support converting cp866 to anything else except
  290. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  291. if cp=866 then
  292. begin
  293. Wide2AnsiMove(source,intermediate,28595,len);
  294. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  295. exit;
  296. end;
  297. {$endif aix}
  298. if (cp=DefaultSystemCodePage) then
  299. begin
  300. { update iconv converter in case the DefaultSystemCodePage has been
  301. changed }
  302. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  303. begin
  304. FiniThread;
  305. InitThread;
  306. end;
  307. use_iconv:=iconv_wide2ansi;
  308. free_iconv:=false;
  309. end
  310. else
  311. begin
  312. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  313. if (use_iconv<>iconv_t(-1)) and
  314. assigned(iconvctl) then
  315. begin
  316. transliterate:=1;
  317. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  318. end;
  319. free_iconv:=true;
  320. end;
  321. { unsupported encoding -> default move }
  322. if use_iconv=iconv_t(-1) then
  323. begin
  324. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  325. exit;
  326. end;
  327. mynil:=nil;
  328. my0:=0;
  329. { rought estimation }
  330. setlength(dest,len*3);
  331. outlength:=len*3;
  332. srclen:=len*2;
  333. srcpos:=source;
  334. destpos:=pchar(dest);
  335. outleft:=outlength;
  336. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  337. begin
  338. err:=fpgetCerrno;
  339. case err of
  340. { last character is incomplete sequence }
  341. ESysEINVAL,
  342. { incomplete sequence in the middle }
  343. ESysEILSEQ:
  344. begin
  345. { skip and set to '?' }
  346. inc(srcpos);
  347. dec(srclen,2);
  348. destpos^:='?';
  349. inc(destpos);
  350. dec(outleft);
  351. { reset }
  352. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  353. if err=ESysEINVAL then
  354. break;
  355. end;
  356. ESysE2BIG:
  357. begin
  358. outoffset:=destpos-pchar(dest);
  359. { extend }
  360. setlength(dest,outlength+len*3);
  361. inc(outleft,len*3);
  362. inc(outlength,len*3);
  363. { string could have been moved }
  364. destpos:=pchar(dest)+outoffset;
  365. end;
  366. else
  367. runerror(231);
  368. end;
  369. end;
  370. // truncate string
  371. setlength(dest,length(dest)-outleft);
  372. SetCodePage(dest,cp,false);
  373. if free_iconv then
  374. iconv_close(use_iconv);
  375. end;
  376. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  377. var
  378. outlength,
  379. outoffset,
  380. outleft : size_t;
  381. use_iconv: iconv_t;
  382. srcpos,
  383. destpos: pchar;
  384. mynil : pchar;
  385. my0 : size_t;
  386. err: cint;
  387. iconvindex: longint;
  388. free_iconv: boolean;
  389. {$ifdef aix}
  390. intermediate: rawbytestring;
  391. {$endif aix}
  392. begin
  393. {$ifdef aix}
  394. { AIX libiconv does not support converting cp866 to anything else except
  395. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  396. if cp=866 then
  397. begin
  398. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  399. source:=pchar(intermediate);
  400. end;
  401. {$endif aix}
  402. if (cp=DefaultSystemCodePage) then
  403. begin
  404. { update iconv converter in case the DefaultSystemCodePage has been
  405. changed }
  406. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  407. begin
  408. FiniThread;
  409. InitThread;
  410. end;
  411. use_iconv:=iconv_ansi2wide;
  412. free_iconv:=false;
  413. end
  414. else
  415. begin
  416. { TODO: add caching (then we also don't need separate code for
  417. the default system page and other ones)
  418. -- typecasting an ansistring function result to pchar is
  419. unsafe normally, but these are constant strings -> no
  420. problem }
  421. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  422. free_iconv:=true;
  423. end;
  424. { unsupported encoding -> default move }
  425. if use_iconv=iconv_t(-1) then
  426. begin
  427. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  428. exit;
  429. end;
  430. mynil:=nil;
  431. my0:=0;
  432. // extra space
  433. outlength:=len+1;
  434. setlength(dest,outlength);
  435. srcpos:=source;
  436. destpos:=pchar(dest);
  437. outleft:=outlength*2;
  438. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  439. begin
  440. err:=fpgetCerrno;
  441. case err of
  442. ESysEINVAL,
  443. ESysEILSEQ:
  444. begin
  445. { skip and set to '?' }
  446. inc(srcpos);
  447. dec(len);
  448. pwidechar(destpos)^:='?';
  449. inc(destpos,2);
  450. dec(outleft,2);
  451. { reset }
  452. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  453. if err=ESysEINVAL then
  454. break;
  455. end;
  456. ESysE2BIG:
  457. begin
  458. outoffset:=destpos-pchar(dest);
  459. { extend }
  460. setlength(dest,outlength+len);
  461. inc(outleft,len*2);
  462. inc(outlength,len);
  463. { string could have been moved }
  464. destpos:=pchar(dest)+outoffset;
  465. end;
  466. else
  467. runerror(231);
  468. end;
  469. end;
  470. // truncate string
  471. setlength(dest,length(dest)-outleft div 2);
  472. if free_iconv then
  473. iconv_close(use_iconv);
  474. end;
  475. function LowerWideString(const s : WideString) : WideString;
  476. var
  477. i : SizeInt;
  478. begin
  479. SetLength(result,length(s));
  480. for i:=0 to length(s)-1 do
  481. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  482. end;
  483. function UpperWideString(const s : WideString) : WideString;
  484. var
  485. i : SizeInt;
  486. begin
  487. SetLength(result,length(s));
  488. for i:=0 to length(s)-1 do
  489. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  490. end;
  491. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  492. begin
  493. if (len>length(s)) then
  494. if (length(s) < 10*256) then
  495. setlength(s,length(s)+10)
  496. else
  497. setlength(s,length(s)+length(s) shr 8);
  498. end;
  499. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  500. begin
  501. EnsureAnsiLen(s,index);
  502. pchar(@s[index])^:=c;
  503. inc(index);
  504. end;
  505. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  506. {$ifndef beos}
  507. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  508. {$else not beos}
  509. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  510. {$endif beos}
  511. var
  512. p : pchar;
  513. mblen : size_t;
  514. begin
  515. { we know that s is unique -> avoid uniquestring calls}
  516. p:=@s[index];
  517. if (nc<=127) then
  518. ConcatCharToAnsiStr(char(nc),s,index)
  519. else
  520. begin
  521. EnsureAnsiLen(s,index+MB_CUR_MAX);
  522. {$ifndef beos}
  523. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  524. {$else not beos}
  525. mblen:=wctomb(p,wchar_t(nc));
  526. {$endif not beos}
  527. if (mblen<>size_t(-1)) then
  528. inc(index,mblen)
  529. else
  530. begin
  531. { invalid wide char }
  532. p^:='?';
  533. inc(index);
  534. end;
  535. end;
  536. end;
  537. function LowerAnsiString(const s : AnsiString) : AnsiString;
  538. var
  539. i, slen,
  540. resindex : SizeInt;
  541. mblen : size_t;
  542. {$ifndef beos}
  543. ombstate,
  544. nmbstate : mbstate_t;
  545. {$endif beos}
  546. wc : wchar_t;
  547. begin
  548. {$ifndef beos}
  549. fillchar(ombstate,sizeof(ombstate),0);
  550. fillchar(nmbstate,sizeof(nmbstate),0);
  551. {$endif beos}
  552. slen:=length(s);
  553. SetLength(result,slen+10);
  554. i:=1;
  555. resindex:=1;
  556. while (i<=slen) do
  557. begin
  558. if (s[i]<=#127) then
  559. begin
  560. wc:=wchar_t(s[i]);
  561. mblen:= 1;
  562. end
  563. else
  564. {$ifndef beos}
  565. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  566. {$else not beos}
  567. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  568. {$endif not beos}
  569. case mblen of
  570. size_t(-2):
  571. begin
  572. { partial invalid character, copy literally }
  573. while (i<=slen) do
  574. begin
  575. ConcatCharToAnsiStr(s[i],result,resindex);
  576. inc(i);
  577. end;
  578. end;
  579. size_t(-1), 0:
  580. begin
  581. { invalid or null character }
  582. ConcatCharToAnsiStr(s[i],result,resindex);
  583. inc(i);
  584. end;
  585. else
  586. begin
  587. { a valid sequence }
  588. { even if mblen = 1, the lowercase version may have a }
  589. { different length }
  590. { We can't do anything special if wchar_t is 16 bit... }
  591. {$ifndef beos}
  592. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  593. {$else not beos}
  594. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  595. {$endif not beos}
  596. inc(i,mblen);
  597. end;
  598. end;
  599. end;
  600. SetLength(result,resindex-1);
  601. end;
  602. function UpperAnsiString(const s : AnsiString) : AnsiString;
  603. var
  604. i, slen,
  605. resindex : SizeInt;
  606. mblen : size_t;
  607. {$ifndef beos}
  608. ombstate,
  609. nmbstate : mbstate_t;
  610. {$endif beos}
  611. wc : wchar_t;
  612. begin
  613. {$ifndef beos}
  614. fillchar(ombstate,sizeof(ombstate),0);
  615. fillchar(nmbstate,sizeof(nmbstate),0);
  616. {$endif beos}
  617. slen:=length(s);
  618. SetLength(result,slen+10);
  619. i:=1;
  620. resindex:=1;
  621. while (i<=slen) do
  622. begin
  623. if (s[i]<=#127) then
  624. begin
  625. wc:=wchar_t(s[i]);
  626. mblen:= 1;
  627. end
  628. else
  629. {$ifndef beos}
  630. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  631. {$else not beos}
  632. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  633. {$endif beos}
  634. case mblen of
  635. size_t(-2):
  636. begin
  637. { partial invalid character, copy literally }
  638. while (i<=slen) do
  639. begin
  640. ConcatCharToAnsiStr(s[i],result,resindex);
  641. inc(i);
  642. end;
  643. end;
  644. size_t(-1), 0:
  645. begin
  646. { invalid or null character }
  647. ConcatCharToAnsiStr(s[i],result,resindex);
  648. inc(i);
  649. end;
  650. else
  651. begin
  652. { a valid sequence }
  653. { even if mblen = 1, the uppercase version may have a }
  654. { different length }
  655. { We can't do anything special if wchar_t is 16 bit... }
  656. {$ifndef beos}
  657. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  658. {$else not beos}
  659. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  660. {$endif not beos}
  661. inc(i,mblen);
  662. end;
  663. end;
  664. end;
  665. SetLength(result,resindex-1);
  666. end;
  667. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  668. var
  669. i, slen,
  670. destindex : SizeInt;
  671. uch : UCS4Char;
  672. begin
  673. slen:=length(s);
  674. setlength(result,slen+1);
  675. i:=1;
  676. destindex:=0;
  677. while (i<=slen) do
  678. begin
  679. uch:=UCS4Char(s[i]);
  680. if (uch=0) then
  681. result[destindex]:=32
  682. else if (uch<=$d7ff) or (uch>=$e000) then
  683. result[destindex]:=uch
  684. else if (uch<=$dbff) and
  685. (i<slen) and
  686. (s[i+1]>=#$dc00) and
  687. (s[i+1]<=#$dfff) then
  688. begin
  689. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  690. inc(i);
  691. end
  692. else { invalid surrogate pair }
  693. result[destindex]:=uch;
  694. inc(i);
  695. inc(destindex);
  696. end;
  697. result[destindex]:=UCS4Char(0);
  698. { Trimming length in this particular case is just a waste of time,
  699. because result will be interpreted as null-terminated and discarded
  700. almost immediately }
  701. end;
  702. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  703. {$if not(defined (aix) and defined(cpupowerpc32))}
  704. var
  705. hs1,hs2 : UCS4String;
  706. begin
  707. { wcscoll interprets null chars as end-of-string -> filter out }
  708. hs1:=WideStringToUCS4StringNoNulls(s1);
  709. hs2:=WideStringToUCS4StringNoNulls(s2);
  710. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  711. end;
  712. {$else}
  713. { AIX/PPC32 has a 16 bit wchar_t }
  714. var
  715. i, len: longint;
  716. hs1, hs2: array of widechar;
  717. begin
  718. len:=length(s1);
  719. setlength(hs1,len+1);
  720. for i:=1 to len do
  721. if s1[i]<>#0 then
  722. hs1[i-1]:=s1[i]
  723. else
  724. hs1[i-1]:=#32;
  725. hs1[len]:=#0;
  726. len:=length(s2);
  727. setlength(hs2,len+1);
  728. for i:=1 to len do
  729. if s2[i]<>#0 then
  730. hs2[i-1]:=s2[i]
  731. else
  732. hs2[i-1]:=#32;
  733. hs2[len]:=#0;
  734. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  735. end;
  736. {$endif}
  737. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  738. begin
  739. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  740. end;
  741. function CharLengthPChar(const Str: PChar): PtrInt;
  742. var
  743. nextlen: ptrint;
  744. s: pchar;
  745. {$ifndef beos}
  746. mbstate: mbstate_t;
  747. {$endif not beos}
  748. begin
  749. result:=0;
  750. s:=str;
  751. {$ifndef beos}
  752. fillchar(mbstate,sizeof(mbstate),0);
  753. {$endif not beos}
  754. repeat
  755. {$ifdef beos}
  756. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  757. {$else beos}
  758. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  759. {$endif beos}
  760. { skip invalid/incomplete sequences }
  761. if (nextlen<0) then
  762. nextlen:=1;
  763. inc(result,nextlen);
  764. inc(s,nextlen);
  765. until (nextlen=0);
  766. end;
  767. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  768. {$ifndef beos}
  769. var
  770. mbstate: mbstate_t;
  771. {$endif not beos}
  772. begin
  773. {$ifdef beos}
  774. result:=ptrint(mblen(str,maxlookahead));
  775. {$else beos}
  776. fillchar(mbstate,sizeof(mbstate),0);
  777. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  778. { mbrlen can also return -2 for "incomplete but potially valid character
  779. and data has been processed" }
  780. if result<0 then
  781. result:=-1;
  782. {$endif beos}
  783. end;
  784. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  785. var
  786. a,b: pchar;
  787. i: PtrInt;
  788. begin
  789. if not(canmodifys1) then
  790. getmem(a,len1+1)
  791. else
  792. a:=s1;
  793. for i:=0 to len1-1 do
  794. if s1[i]<>#0 then
  795. a[i]:=s1[i]
  796. else
  797. a[i]:=#32;
  798. a[len1]:=#0;
  799. if not(canmodifys2) then
  800. getmem(b,len2+1)
  801. else
  802. b:=s2;
  803. for i:=0 to len2-1 do
  804. if s2[i]<>#0 then
  805. b[i]:=s2[i]
  806. else
  807. b[i]:=#32;
  808. b[len2]:=#0;
  809. result:=strcoll(a,b);
  810. if not(canmodifys1) then
  811. freemem(a);
  812. if not(canmodifys2) then
  813. freemem(b);
  814. end;
  815. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  816. begin
  817. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  818. end;
  819. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  820. begin
  821. result:=strcoll(s1,s2);
  822. end;
  823. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  824. var
  825. a, b: AnsiString;
  826. begin
  827. a:=UpperAnsistring(s1);
  828. b:=UpperAnsistring(s2);
  829. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  830. end;
  831. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  832. begin
  833. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  834. end;
  835. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  836. var
  837. a, b: pchar;
  838. begin
  839. if (maxlen=0) then
  840. exit(0);
  841. if (s1[maxlen]<>#0) then
  842. begin
  843. getmem(a,maxlen+1);
  844. move(s1^,a^,maxlen);
  845. a[maxlen]:=#0;
  846. end
  847. else
  848. a:=s1;
  849. if (s2[maxlen]<>#0) then
  850. begin
  851. getmem(b,maxlen+1);
  852. move(s2^,b^,maxlen);
  853. b[maxlen]:=#0;
  854. end
  855. else
  856. b:=s2;
  857. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  858. if (a<>s1) then
  859. freemem(a);
  860. if (b<>s2) then
  861. freemem(b);
  862. end;
  863. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  864. var
  865. a, b: ansistring;
  866. begin
  867. if (maxlen=0) then
  868. exit(0);
  869. setlength(a,maxlen);
  870. move(s1^,a[1],maxlen);
  871. setlength(b,maxlen);
  872. move(s2^,b[1],maxlen);
  873. result:=AnsiCompareText(a,b);
  874. end;
  875. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  876. var
  877. newlen: sizeint;
  878. begin
  879. newlen:=length(s);
  880. if newlen>strlen(orgp) then
  881. fpc_rangeerror;
  882. p:=orgp;
  883. if (newlen>0) then
  884. move(s[1],p[0],newlen);
  885. p[newlen]:=#0;
  886. end;
  887. function AnsiStrLower(Str: PChar): PChar;
  888. var
  889. temp: ansistring;
  890. begin
  891. temp:=loweransistring(str);
  892. ansi2pchar(temp,str,result);
  893. end;
  894. function AnsiStrUpper(Str: PChar): PChar;
  895. var
  896. temp: ansistring;
  897. begin
  898. temp:=upperansistring(str);
  899. ansi2pchar(temp,str,result);
  900. end;
  901. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  902. var
  903. langinfo: pchar;
  904. begin
  905. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  906. if stdcp=scpFileSystemSingleByte then
  907. begin
  908. result:=CP_UTF8;
  909. exit;
  910. end;
  911. {$endif}
  912. langinfo:=nl_langinfo(CODESET);
  913. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  914. that causes it to return an empty string of UTF-8 locales
  915. -> patch up (and in general, UTF-8 is a good default on
  916. Unix platforms) }
  917. if not assigned(langinfo) or
  918. (langinfo^=#0) then
  919. langinfo:='UTF-8';
  920. Result := GetCodepageByName(ansistring(langinfo));
  921. end;
  922. {$ifdef FPC_HAS_CPSTRING}
  923. procedure SetStdIOCodePage(var T: Text); inline;
  924. begin
  925. case TextRec(T).Mode of
  926. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  927. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  928. end;
  929. end;
  930. procedure SetStdIOCodePages; inline;
  931. begin
  932. SetStdIOCodePage(Input);
  933. SetStdIOCodePage(Output);
  934. SetStdIOCodePage(ErrOutput);
  935. SetStdIOCodePage(StdOut);
  936. SetStdIOCodePage(StdErr);
  937. end;
  938. {$endif FPC_HAS_CPSTRING}
  939. var
  940. OrgWideStringManager: TUnicodeStringManager;
  941. Procedure SetCWideStringManager;
  942. Var
  943. CWideStringManager : TUnicodeStringManager;
  944. begin
  945. GetUnicodeStringManager(OrgWideStringManager);
  946. CWideStringManager:=OrgWideStringManager;
  947. With CWideStringManager do
  948. begin
  949. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  950. Ansi2WideMoveProc:=@Ansi2WideMove;
  951. UpperWideStringProc:=@UpperWideString;
  952. LowerWideStringProc:=@LowerWideString;
  953. CompareWideStringProc:=@CompareWideString;
  954. CompareTextWideStringProc:=@CompareTextWideString;
  955. CharLengthPCharProc:=@CharLengthPChar;
  956. CodePointLengthProc:=@CodePointLength;
  957. UpperAnsiStringProc:=@UpperAnsiString;
  958. LowerAnsiStringProc:=@LowerAnsiString;
  959. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  960. CompareTextAnsiStringProc:=@AnsiCompareText;
  961. StrCompAnsiStringProc:=@StrCompAnsi;
  962. StrICompAnsiStringProc:=@AnsiStrIComp;
  963. StrLCompAnsiStringProc:=@AnsiStrLComp;
  964. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  965. StrLowerAnsiStringProc:=@AnsiStrLower;
  966. StrUpperAnsiStringProc:=@AnsiStrUpper;
  967. ThreadInitProc:=@InitThread;
  968. ThreadFiniProc:=@FiniThread;
  969. { Unicode }
  970. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  971. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  972. UpperUnicodeStringProc:=@UpperWideString;
  973. LowerUnicodeStringProc:=@LowerWideString;
  974. CompareUnicodeStringProc:=@CompareWideString;
  975. CompareTextUnicodeStringProc:=@CompareTextWideString;
  976. { CodePage }
  977. GetStandardCodePageProc:=@GetStandardCodePage;
  978. end;
  979. SetUnicodeStringManager(CWideStringManager);
  980. end;
  981. var
  982. iconvlib:TLibHandle;
  983. initialization
  984. SetCWideStringManager;
  985. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  986. { with the information from the environment variables according to POSIX }
  987. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  988. setlocale(LC_ALL,'');
  989. { load iconvctl function }
  990. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  991. if iconvlib<>0 then
  992. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  993. { set the DefaultSystemCodePage }
  994. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  995. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  996. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  997. {$ifdef FPC_HAS_CPSTRING}
  998. SetStdIOCodePages;
  999. {$endif FPC_HAS_CPSTRING}
  1000. { init conversion tables for main program }
  1001. InitThread;
  1002. finalization
  1003. { fini conversion tables for main program }
  1004. FiniThread;
  1005. { unload iconv library }
  1006. if iconvlib<>0 then
  1007. FreeLibrary(iconvlib);
  1008. { restore previous (probably default) widestring manager so that subsequent calls
  1009. into the widestring manager won't trigger the finalized functionality }
  1010. SetWideStringManager(OrgWideStringManager);
  1011. end.