cwstring.pp 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifdef TEST_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifndef iconv_is_in_libc}
  30. {$if defined(haiku)}
  31. {$linklib textencoding}
  32. {$else}
  33. {$linklib iconv}
  34. {$endif}
  35. {$define useiconv}
  36. {$endif not iconv_is_in_libc}
  37. Uses
  38. BaseUnix,
  39. ctypes,
  40. unix,
  41. unixtype,
  42. initc,
  43. dynlibs,
  44. unixcp;
  45. Const
  46. {$ifndef useiconv}
  47. libiconvname='c'; // is in libc under Linux.
  48. libprefix='lib';
  49. {$else}
  50. {$ifdef haiku}
  51. libiconvname='textencoding'; // is in libtextencoding under Haiku
  52. libprefix='lib';
  53. {$else}
  54. {$if defined(darwin) or defined(aix)}
  55. libiconvname='libiconv';
  56. libprefix='';
  57. {$else}
  58. libiconvname='iconv';
  59. libprefix='lib';
  60. {$endif}
  61. {$endif}
  62. {$endif}
  63. { helper functions from libc }
  64. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  65. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  66. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  67. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  68. {$ifdef netbsd}
  69. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  70. that should be used }
  71. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale_mb_len_max_32';
  72. {$else}
  73. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  74. {$endif}
  75. {$ifndef beos}
  76. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  77. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  78. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  79. {$else beos}
  80. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  81. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  82. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  83. {$endif beos}
  84. const
  85. {$if defined(linux)}
  86. __LC_CTYPE = 0;
  87. LC_ALL = 6;
  88. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  89. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  90. CODESET = _NL_CTYPE_CODESET_NAME;
  91. {$elseif defined(darwin)}
  92. CODESET = 0;
  93. LC_ALL = 0;
  94. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  95. __LC_CTYPE = 0;
  96. LC_ALL = 0;
  97. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  98. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  99. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  100. {$elseif defined(solaris)}
  101. {$define ACCEPT_646}
  102. CODESET=49;
  103. LC_ALL = 6;
  104. {$elseif defined(beos)}
  105. {$warning check correct value for BeOS}
  106. CODESET=49;
  107. {$ifdef haiku}
  108. LC_ALL = 0; // Checked for Haiku
  109. {$else}
  110. LC_ALL = 6; // Checked for BeOS
  111. {$endif}
  112. ESysEILSEQ = EILSEQ;
  113. {$elseif defined(OpenBSD)}
  114. {$define ACCEPT_646}
  115. CODESET = 51;
  116. LC_ALL = 0;
  117. {$elseif defined(NetBSD)}
  118. {$define ACCEPT_646}
  119. CODESET = 51;
  120. LC_ALL = 0;
  121. {$elseif defined(aix)}
  122. CODESET = 49;
  123. LC_ALL = -1;
  124. {$else not aix}
  125. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  126. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  127. {$endif}
  128. { unicode encoding name }
  129. {$ifdef FPC_LITTLE_ENDIAN}
  130. unicode_encoding2 = 'UTF-16LE';
  131. unicode_encoding4 = 'UCS-4LE';
  132. {$else FPC_LITTLE_ENDIAN}
  133. {$ifdef AIX}
  134. unicode_encoding2 = 'UTF-16';
  135. {$else AIX}
  136. unicode_encoding2 = 'UTF-16BE';
  137. unicode_encoding4 = 'UCS-4BE';
  138. {$endif AIX}
  139. {$endif FPC_LITTLE_ENDIAN}
  140. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  141. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  142. { that's a libc macro mapped to internal functions/variables }
  143. { and thus not a stable external API on systems where libc }
  144. { breaks backwards compatibility every now and then }
  145. MB_CUR_MAX = 10;
  146. { Requests for iconvctl }
  147. ICONV_TRIVIALP = 0; // int *argument
  148. ICONV_GET_TRANSLITERATE = 1; // int *argument
  149. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  150. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  151. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  152. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  153. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  154. type
  155. piconv_t = ^iconv_t;
  156. iconv_t = pointer;
  157. nl_item = cint;
  158. {$ifdef haiku}
  159. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  160. {$else}
  161. {$ifndef beos}
  162. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  163. {$endif}
  164. {$endif}
  165. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  166. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  167. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  168. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  169. const
  170. iconvctlname='iconvctl';
  171. {$else}
  172. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  173. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  174. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  175. const
  176. iconvctlname='libiconvctl';
  177. {$endif}
  178. var
  179. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  180. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  181. threadvar
  182. iconv_ansi2wide,
  183. iconv_wide2ansi : iconv_t;
  184. { since we cache the iconv_t converters, we have to do the same
  185. for the DefaultSystemCodePage variable since if it changes, we
  186. have to re-initialize the converters too. We can't do that via
  187. a callback in the widestring manager because DefaultSystemCodePage
  188. is not a threadvar and we can't automatically change this in all
  189. threads }
  190. current_DefaultSystemCodePage: TSystemCodePage;
  191. procedure InitThread;
  192. var
  193. transliterate: cint;
  194. iconvindex: longint;
  195. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  196. iconvname: rawbytestring;
  197. {$endif}
  198. begin
  199. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  200. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  201. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  202. if iconvindex<>-1 then
  203. iconvname:=UnixCpMap[iconvindex].name
  204. else
  205. { default to UTF-8 on Unix platforms }
  206. iconvname:='UTF-8';
  207. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  208. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  209. {$else}
  210. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  211. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  212. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  213. {$endif}
  214. if assigned(iconvctl) and
  215. (iconv_wide2ansi<>iconv_t(-1)) then
  216. begin
  217. transliterate:=1;
  218. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  219. end;
  220. end;
  221. procedure FiniThread;
  222. begin
  223. if (iconv_wide2ansi <> iconv_t(-1)) then
  224. iconv_close(iconv_wide2ansi);
  225. if (iconv_ansi2wide <> iconv_t(-1)) then
  226. iconv_close(iconv_ansi2wide);
  227. end;
  228. {$if defined(beos) and not defined(haiku)}
  229. function nl_langinfo(__item:nl_item):pchar;
  230. begin
  231. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  232. // Now implement the minimum required to correctly initialize WideString support
  233. case __item of
  234. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  235. else
  236. begin
  237. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  238. Result := '';
  239. end;
  240. end;
  241. end;
  242. {$endif}
  243. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  244. var
  245. iconvindex: longint;
  246. begin
  247. { TODO: add caching (then we also don't need separate code for
  248. the default system page and other ones)
  249. -- typecasting an ansistring function result to pchar is
  250. unsafe normally, but these are constant strings -> no
  251. problem }
  252. open_iconv_for_cps:=iconv_t(-1);
  253. iconvindex:=GetCodepageData(cp);
  254. if iconvindex=-1 then
  255. exit;
  256. repeat
  257. if cp_is_from then
  258. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  259. else
  260. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  261. inc(iconvindex);
  262. until (open_iconv_for_cps<>iconv_t(-1)) or
  263. (iconvindex>high(UnixCpMap)) or
  264. (UnixCpMap[iconvindex].cp<>cp);
  265. end;
  266. {$ifdef aix}
  267. {$i cwstraix.inc}
  268. {$endif aix}
  269. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  270. var
  271. outlength,
  272. outoffset,
  273. srclen,
  274. outleft : size_t;
  275. use_iconv: iconv_t;
  276. srcpos : pwidechar;
  277. destpos: pchar;
  278. mynil : pchar;
  279. my0 : size_t;
  280. err : longint;
  281. transliterate: cint;
  282. free_iconv: boolean;
  283. {$ifdef aix}
  284. intermediate: rawbytestring;
  285. {$endif aix}
  286. begin
  287. {$ifdef aix}
  288. { AIX libiconv does not support converting cp866 to anything else except
  289. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  290. if cp=866 then
  291. begin
  292. Wide2AnsiMove(source,intermediate,28595,len);
  293. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  294. exit;
  295. end;
  296. {$endif aix}
  297. if (cp=DefaultSystemCodePage) then
  298. begin
  299. { update iconv converter in case the DefaultSystemCodePage has been
  300. changed }
  301. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  302. begin
  303. FiniThread;
  304. InitThread;
  305. end;
  306. use_iconv:=iconv_wide2ansi;
  307. free_iconv:=false;
  308. end
  309. else
  310. begin
  311. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  312. if (use_iconv<>iconv_t(-1)) and
  313. assigned(iconvctl) then
  314. begin
  315. transliterate:=1;
  316. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  317. end;
  318. free_iconv:=true;
  319. end;
  320. { unsupported encoding -> default move }
  321. if use_iconv=iconv_t(-1) then
  322. begin
  323. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  324. exit;
  325. end;
  326. mynil:=nil;
  327. my0:=0;
  328. { rought estimation }
  329. setlength(dest,len*3);
  330. outlength:=len*3;
  331. srclen:=len*2;
  332. srcpos:=source;
  333. destpos:=pchar(dest);
  334. outleft:=outlength;
  335. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  336. begin
  337. err:=fpgetCerrno;
  338. case err of
  339. { last character is incomplete sequence }
  340. ESysEINVAL,
  341. { incomplete sequence in the middle }
  342. ESysEILSEQ:
  343. begin
  344. { skip and set to '?' }
  345. inc(srcpos);
  346. dec(srclen,2);
  347. destpos^:='?';
  348. inc(destpos);
  349. dec(outleft);
  350. { reset }
  351. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  352. if err=ESysEINVAL then
  353. break;
  354. end;
  355. ESysE2BIG:
  356. begin
  357. outoffset:=destpos-pchar(dest);
  358. { extend }
  359. setlength(dest,outlength+len*3);
  360. inc(outleft,len*3);
  361. inc(outlength,len*3);
  362. { string could have been moved }
  363. destpos:=pchar(dest)+outoffset;
  364. end;
  365. else
  366. runerror(231);
  367. end;
  368. end;
  369. // truncate string
  370. setlength(dest,length(dest)-outleft);
  371. SetCodePage(dest,cp,false);
  372. if free_iconv then
  373. iconv_close(use_iconv);
  374. end;
  375. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  376. var
  377. outlength,
  378. outoffset,
  379. outleft : size_t;
  380. use_iconv: iconv_t;
  381. srcpos,
  382. destpos: pchar;
  383. mynil : pchar;
  384. my0 : size_t;
  385. err: cint;
  386. iconvindex: longint;
  387. free_iconv: boolean;
  388. {$ifdef aix}
  389. intermediate: rawbytestring;
  390. {$endif aix}
  391. begin
  392. {$ifdef aix}
  393. { AIX libiconv does not support converting cp866 to anything else except
  394. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  395. if cp=866 then
  396. begin
  397. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  398. source:=pchar(intermediate);
  399. end;
  400. {$endif aix}
  401. if (cp=DefaultSystemCodePage) then
  402. begin
  403. { update iconv converter in case the DefaultSystemCodePage has been
  404. changed }
  405. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  406. begin
  407. FiniThread;
  408. InitThread;
  409. end;
  410. use_iconv:=iconv_ansi2wide;
  411. free_iconv:=false;
  412. end
  413. else
  414. begin
  415. { TODO: add caching (then we also don't need separate code for
  416. the default system page and other ones)
  417. -- typecasting an ansistring function result to pchar is
  418. unsafe normally, but these are constant strings -> no
  419. problem }
  420. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  421. free_iconv:=true;
  422. end;
  423. { unsupported encoding -> default move }
  424. if use_iconv=iconv_t(-1) then
  425. begin
  426. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  427. exit;
  428. end;
  429. mynil:=nil;
  430. my0:=0;
  431. // extra space
  432. outlength:=len+1;
  433. setlength(dest,outlength);
  434. srcpos:=source;
  435. destpos:=pchar(dest);
  436. outleft:=outlength*2;
  437. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  438. begin
  439. err:=fpgetCerrno;
  440. case err of
  441. ESysEINVAL,
  442. ESysEILSEQ:
  443. begin
  444. { skip and set to '?' }
  445. inc(srcpos);
  446. dec(len);
  447. pwidechar(destpos)^:='?';
  448. inc(destpos,2);
  449. dec(outleft,2);
  450. { reset }
  451. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  452. if err=ESysEINVAL then
  453. break;
  454. end;
  455. ESysE2BIG:
  456. begin
  457. outoffset:=destpos-pchar(dest);
  458. { extend }
  459. setlength(dest,outlength+len);
  460. inc(outleft,len*2);
  461. inc(outlength,len);
  462. { string could have been moved }
  463. destpos:=pchar(dest)+outoffset;
  464. end;
  465. else
  466. runerror(231);
  467. end;
  468. end;
  469. // truncate string
  470. setlength(dest,length(dest)-outleft div 2);
  471. if free_iconv then
  472. iconv_close(use_iconv);
  473. end;
  474. function LowerWideString(const s : WideString) : WideString;
  475. var
  476. i : SizeInt;
  477. begin
  478. SetLength(result,length(s));
  479. for i:=0 to length(s)-1 do
  480. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  481. end;
  482. function UpperWideString(const s : WideString) : WideString;
  483. var
  484. i : SizeInt;
  485. begin
  486. SetLength(result,length(s));
  487. for i:=0 to length(s)-1 do
  488. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  489. end;
  490. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  491. begin
  492. if (len>length(s)) then
  493. if (length(s) < 10*256) then
  494. setlength(s,length(s)+10)
  495. else
  496. setlength(s,length(s)+length(s) shr 8);
  497. end;
  498. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  499. begin
  500. EnsureAnsiLen(s,index);
  501. pchar(@s[index])^:=c;
  502. inc(index);
  503. end;
  504. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  505. {$ifndef beos}
  506. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  507. {$else not beos}
  508. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  509. {$endif beos}
  510. var
  511. p : pchar;
  512. mblen : size_t;
  513. begin
  514. { we know that s is unique -> avoid uniquestring calls}
  515. p:=@s[index];
  516. if (nc<=127) then
  517. ConcatCharToAnsiStr(char(nc),s,index)
  518. else
  519. begin
  520. EnsureAnsiLen(s,index+MB_CUR_MAX);
  521. {$ifndef beos}
  522. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  523. {$else not beos}
  524. mblen:=wctomb(p,wchar_t(nc));
  525. {$endif not beos}
  526. if (mblen<>size_t(-1)) then
  527. inc(index,mblen)
  528. else
  529. begin
  530. { invalid wide char }
  531. p^:='?';
  532. inc(index);
  533. end;
  534. end;
  535. end;
  536. function LowerAnsiString(const s : AnsiString) : AnsiString;
  537. var
  538. i, slen,
  539. resindex : SizeInt;
  540. mblen : size_t;
  541. {$ifndef beos}
  542. ombstate,
  543. nmbstate : mbstate_t;
  544. {$endif beos}
  545. wc : wchar_t;
  546. begin
  547. {$ifndef beos}
  548. fillchar(ombstate,sizeof(ombstate),0);
  549. fillchar(nmbstate,sizeof(nmbstate),0);
  550. {$endif beos}
  551. slen:=length(s);
  552. SetLength(result,slen+10);
  553. i:=1;
  554. resindex:=1;
  555. while (i<=slen) do
  556. begin
  557. if (s[i]<=#127) then
  558. begin
  559. wc:=wchar_t(s[i]);
  560. mblen:= 1;
  561. end
  562. else
  563. {$ifndef beos}
  564. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  565. {$else not beos}
  566. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  567. {$endif not beos}
  568. case mblen of
  569. size_t(-2):
  570. begin
  571. { partial invalid character, copy literally }
  572. while (i<=slen) do
  573. begin
  574. ConcatCharToAnsiStr(s[i],result,resindex);
  575. inc(i);
  576. end;
  577. end;
  578. size_t(-1), 0:
  579. begin
  580. { invalid or null character }
  581. ConcatCharToAnsiStr(s[i],result,resindex);
  582. inc(i);
  583. end;
  584. else
  585. begin
  586. { a valid sequence }
  587. { even if mblen = 1, the lowercase version may have a }
  588. { different length }
  589. { We can't do anything special if wchar_t is 16 bit... }
  590. {$ifndef beos}
  591. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  592. {$else not beos}
  593. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  594. {$endif not beos}
  595. inc(i,mblen);
  596. end;
  597. end;
  598. end;
  599. SetLength(result,resindex-1);
  600. end;
  601. function UpperAnsiString(const s : AnsiString) : AnsiString;
  602. var
  603. i, slen,
  604. resindex : SizeInt;
  605. mblen : size_t;
  606. {$ifndef beos}
  607. ombstate,
  608. nmbstate : mbstate_t;
  609. {$endif beos}
  610. wc : wchar_t;
  611. begin
  612. {$ifndef beos}
  613. fillchar(ombstate,sizeof(ombstate),0);
  614. fillchar(nmbstate,sizeof(nmbstate),0);
  615. {$endif beos}
  616. slen:=length(s);
  617. SetLength(result,slen+10);
  618. i:=1;
  619. resindex:=1;
  620. while (i<=slen) do
  621. begin
  622. if (s[i]<=#127) then
  623. begin
  624. wc:=wchar_t(s[i]);
  625. mblen:= 1;
  626. end
  627. else
  628. {$ifndef beos}
  629. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  630. {$else not beos}
  631. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  632. {$endif beos}
  633. case mblen of
  634. size_t(-2):
  635. begin
  636. { partial invalid character, copy literally }
  637. while (i<=slen) do
  638. begin
  639. ConcatCharToAnsiStr(s[i],result,resindex);
  640. inc(i);
  641. end;
  642. end;
  643. size_t(-1), 0:
  644. begin
  645. { invalid or null character }
  646. ConcatCharToAnsiStr(s[i],result,resindex);
  647. inc(i);
  648. end;
  649. else
  650. begin
  651. { a valid sequence }
  652. { even if mblen = 1, the uppercase version may have a }
  653. { different length }
  654. { We can't do anything special if wchar_t is 16 bit... }
  655. {$ifndef beos}
  656. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  657. {$else not beos}
  658. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  659. {$endif not beos}
  660. inc(i,mblen);
  661. end;
  662. end;
  663. end;
  664. SetLength(result,resindex-1);
  665. end;
  666. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  667. var
  668. i, slen,
  669. destindex : SizeInt;
  670. uch : UCS4Char;
  671. begin
  672. slen:=length(s);
  673. setlength(result,slen+1);
  674. i:=1;
  675. destindex:=0;
  676. while (i<=slen) do
  677. begin
  678. uch:=UCS4Char(s[i]);
  679. if (uch=0) then
  680. result[destindex]:=32
  681. else if (uch<=$d7ff) or (uch>=$e000) then
  682. result[destindex]:=uch
  683. else if (uch<=$dbff) and
  684. (i<slen) and
  685. (s[i+1]>=#$dc00) and
  686. (s[i+1]<=#$dfff) then
  687. begin
  688. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  689. inc(i);
  690. end
  691. else { invalid surrogate pair }
  692. result[destindex]:=uch;
  693. inc(i);
  694. inc(destindex);
  695. end;
  696. result[destindex]:=UCS4Char(0);
  697. { Trimming length in this particular case is just a waste of time,
  698. because result will be interpreted as null-terminated and discarded
  699. almost immediately }
  700. end;
  701. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  702. {$if not(defined (aix) and defined(cpupowerpc32))}
  703. var
  704. hs1,hs2 : UCS4String;
  705. begin
  706. { wcscoll interprets null chars as end-of-string -> filter out }
  707. hs1:=WideStringToUCS4StringNoNulls(s1);
  708. hs2:=WideStringToUCS4StringNoNulls(s2);
  709. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  710. end;
  711. {$else}
  712. { AIX/PPC32 has a 16 bit wchar_t }
  713. var
  714. i, len: longint;
  715. hs1, hs2: array of widechar;
  716. begin
  717. len:=length(s1);
  718. setlength(hs1,len+1);
  719. for i:=1 to len do
  720. if s1[i]<>#0 then
  721. hs1[i-1]:=s1[i]
  722. else
  723. hs1[i-1]:=#32;
  724. hs1[len]:=#0;
  725. len:=length(s2);
  726. setlength(hs2,len+1);
  727. for i:=1 to len do
  728. if s2[i]<>#0 then
  729. hs2[i-1]:=s2[i]
  730. else
  731. hs2[i-1]:=#32;
  732. hs2[len]:=#0;
  733. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  734. end;
  735. {$endif}
  736. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  737. begin
  738. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  739. end;
  740. function CharLengthPChar(const Str: PChar): PtrInt;
  741. var
  742. nextlen: ptrint;
  743. s: pchar;
  744. {$ifndef beos}
  745. mbstate: mbstate_t;
  746. {$endif not beos}
  747. begin
  748. result:=0;
  749. s:=str;
  750. {$ifndef beos}
  751. fillchar(mbstate,sizeof(mbstate),0);
  752. {$endif not beos}
  753. repeat
  754. {$ifdef beos}
  755. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  756. {$else beos}
  757. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  758. {$endif beos}
  759. { skip invalid/incomplete sequences }
  760. if (nextlen<0) then
  761. nextlen:=1;
  762. inc(result,nextlen);
  763. inc(s,nextlen);
  764. until (nextlen=0);
  765. end;
  766. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  767. {$ifndef beos}
  768. var
  769. mbstate: mbstate_t;
  770. {$endif not beos}
  771. begin
  772. {$ifdef beos}
  773. result:=ptrint(mblen(str,maxlookahead));
  774. {$else beos}
  775. fillchar(mbstate,sizeof(mbstate),0);
  776. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  777. { mbrlen can also return -2 for "incomplete but potially valid character
  778. and data has been processed" }
  779. if result<0 then
  780. result:=-1;
  781. {$endif beos}
  782. end;
  783. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  784. var
  785. a,b: pchar;
  786. i: PtrInt;
  787. begin
  788. if not(canmodifys1) then
  789. getmem(a,len1+1)
  790. else
  791. a:=s1;
  792. for i:=0 to len1-1 do
  793. if s1[i]<>#0 then
  794. a[i]:=s1[i]
  795. else
  796. a[i]:=#32;
  797. a[len1]:=#0;
  798. if not(canmodifys2) then
  799. getmem(b,len2+1)
  800. else
  801. b:=s2;
  802. for i:=0 to len2-1 do
  803. if s2[i]<>#0 then
  804. b[i]:=s2[i]
  805. else
  806. b[i]:=#32;
  807. b[len2]:=#0;
  808. result:=strcoll(a,b);
  809. if not(canmodifys1) then
  810. freemem(a);
  811. if not(canmodifys2) then
  812. freemem(b);
  813. end;
  814. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  815. begin
  816. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  817. end;
  818. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  819. begin
  820. result:=strcoll(s1,s2);
  821. end;
  822. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  823. var
  824. a, b: AnsiString;
  825. begin
  826. a:=UpperAnsistring(s1);
  827. b:=UpperAnsistring(s2);
  828. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  829. end;
  830. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  831. begin
  832. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  833. end;
  834. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  835. var
  836. a, b: pchar;
  837. begin
  838. if (maxlen=0) then
  839. exit(0);
  840. if (s1[maxlen]<>#0) then
  841. begin
  842. getmem(a,maxlen+1);
  843. move(s1^,a^,maxlen);
  844. a[maxlen]:=#0;
  845. end
  846. else
  847. a:=s1;
  848. if (s2[maxlen]<>#0) then
  849. begin
  850. getmem(b,maxlen+1);
  851. move(s2^,b^,maxlen);
  852. b[maxlen]:=#0;
  853. end
  854. else
  855. b:=s2;
  856. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  857. if (a<>s1) then
  858. freemem(a);
  859. if (b<>s2) then
  860. freemem(b);
  861. end;
  862. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  863. var
  864. a, b: ansistring;
  865. begin
  866. if (maxlen=0) then
  867. exit(0);
  868. setlength(a,maxlen);
  869. move(s1^,a[1],maxlen);
  870. setlength(b,maxlen);
  871. move(s2^,b[1],maxlen);
  872. result:=AnsiCompareText(a,b);
  873. end;
  874. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  875. var
  876. newlen: sizeint;
  877. begin
  878. newlen:=length(s);
  879. if newlen>strlen(orgp) then
  880. fpc_rangeerror;
  881. p:=orgp;
  882. if (newlen>0) then
  883. move(s[1],p[0],newlen);
  884. p[newlen]:=#0;
  885. end;
  886. function AnsiStrLower(Str: PChar): PChar;
  887. var
  888. temp: ansistring;
  889. begin
  890. temp:=loweransistring(str);
  891. ansi2pchar(temp,str,result);
  892. end;
  893. function AnsiStrUpper(Str: PChar): PChar;
  894. var
  895. temp: ansistring;
  896. begin
  897. temp:=upperansistring(str);
  898. ansi2pchar(temp,str,result);
  899. end;
  900. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  901. var
  902. langinfo: pchar;
  903. begin
  904. langinfo:=nl_langinfo(CODESET);
  905. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  906. that causes it to return an empty string of UTF-8 locales
  907. -> patch up (and in general, UTF-8 is a good default on
  908. Unix platforms) }
  909. if not assigned(langinfo) or
  910. (langinfo^=#0) then
  911. langinfo:='UTF-8';
  912. Result := GetCodepageByName(ansistring(langinfo));
  913. end;
  914. {$ifdef FPC_HAS_CPSTRING}
  915. {$i textrec.inc}
  916. procedure SetStdIOCodePage(var T: Text); inline;
  917. begin
  918. case TextRec(T).Mode of
  919. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  920. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  921. end;
  922. end;
  923. procedure SetStdIOCodePages; inline;
  924. begin
  925. SetStdIOCodePage(Input);
  926. SetStdIOCodePage(Output);
  927. SetStdIOCodePage(ErrOutput);
  928. SetStdIOCodePage(StdOut);
  929. SetStdIOCodePage(StdErr);
  930. end;
  931. {$endif FPC_HAS_CPSTRING}
  932. Procedure SetCWideStringManager;
  933. Var
  934. CWideStringManager : TUnicodeStringManager;
  935. begin
  936. CWideStringManager:=widestringmanager;
  937. With CWideStringManager do
  938. begin
  939. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  940. Ansi2WideMoveProc:=@Ansi2WideMove;
  941. UpperWideStringProc:=@UpperWideString;
  942. LowerWideStringProc:=@LowerWideString;
  943. CompareWideStringProc:=@CompareWideString;
  944. CompareTextWideStringProc:=@CompareTextWideString;
  945. CharLengthPCharProc:=@CharLengthPChar;
  946. CodePointLengthProc:=@CodePointLength;
  947. UpperAnsiStringProc:=@UpperAnsiString;
  948. LowerAnsiStringProc:=@LowerAnsiString;
  949. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  950. CompareTextAnsiStringProc:=@AnsiCompareText;
  951. StrCompAnsiStringProc:=@StrCompAnsi;
  952. StrICompAnsiStringProc:=@AnsiStrIComp;
  953. StrLCompAnsiStringProc:=@AnsiStrLComp;
  954. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  955. StrLowerAnsiStringProc:=@AnsiStrLower;
  956. StrUpperAnsiStringProc:=@AnsiStrUpper;
  957. ThreadInitProc:=@InitThread;
  958. ThreadFiniProc:=@FiniThread;
  959. { Unicode }
  960. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  961. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  962. UpperUnicodeStringProc:=@UpperWideString;
  963. LowerUnicodeStringProc:=@LowerWideString;
  964. CompareUnicodeStringProc:=@CompareWideString;
  965. CompareTextUnicodeStringProc:=@CompareTextWideString;
  966. { CodePage }
  967. GetStandardCodePageProc:=@GetStandardCodePage;
  968. end;
  969. SetUnicodeStringManager(CWideStringManager);
  970. end;
  971. var
  972. iconvlib:TLibHandle;
  973. initialization
  974. SetCWideStringManager;
  975. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  976. { with the information from the environment variables according to POSIX }
  977. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  978. setlocale(LC_ALL,'');
  979. { load iconvctl function }
  980. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  981. if iconvlib<>0 then
  982. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  983. { set the DefaultSystemCodePage }
  984. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  985. {$ifdef FPC_HAS_CPSTRING}
  986. SetStdIOCodePages;
  987. {$endif FPC_HAS_CPSTRING}
  988. { init conversion tables for main program }
  989. InitThread;
  990. finalization
  991. { fini conversion tables for main program }
  992. FiniThread;
  993. { unload iconv library }
  994. if iconvlib<>0 then
  995. FreeLibrary(iconvlib);
  996. end.