cwstring.pp 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(haiku)}
  22. {$linklib textencoding}
  23. {$linklib locale}
  24. {$else}
  25. {$linklib iconv}
  26. {$endif}
  27. {$define useiconv}
  28. {$endif linux}
  29. Uses
  30. BaseUnix,
  31. ctypes,
  32. unix,
  33. unixtype,
  34. initc,
  35. dynlibs;
  36. Const
  37. {$ifndef useiconv}
  38. libiconvname='c'; // is in libc under Linux.
  39. libprefix='lib';
  40. {$else}
  41. {$ifdef haiku}
  42. libiconvname='textencoding'; // is in libtextencoding under Haiku
  43. libprefix='lib';
  44. {$else}
  45. {$if defined(darwin) or defined(aix)}
  46. libiconvname='libiconv';
  47. libprefix='';
  48. {$else}
  49. libiconvname='iconv';
  50. libprefix='lib';
  51. {$endif}
  52. {$endif}
  53. {$endif}
  54. { helper functions from libc }
  55. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  56. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  57. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  58. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  59. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  60. {$ifndef beos}
  61. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  62. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  63. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  64. {$else beos}
  65. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  66. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  67. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  68. {$endif beos}
  69. const
  70. {$if defined(linux)}
  71. __LC_CTYPE = 0;
  72. LC_ALL = 6;
  73. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  74. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  75. CODESET = _NL_CTYPE_CODESET_NAME;
  76. {$elseif defined(darwin)}
  77. CODESET = 0;
  78. LC_ALL = 0;
  79. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  80. __LC_CTYPE = 0;
  81. LC_ALL = 0;
  82. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  83. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  84. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  85. {$elseif defined(solaris)}
  86. CODESET=49;
  87. LC_ALL = 6;
  88. {$elseif defined(beos)}
  89. {$warning check correct value for BeOS}
  90. CODESET=49;
  91. {$ifdef haiku}
  92. LC_ALL = 0; // Checked for Haiku
  93. {$else}
  94. LC_ALL = 6; // Checked for BeOS
  95. {$endif}
  96. ESysEILSEQ = EILSEQ;
  97. {$elseif defined(OpenBSD)}
  98. CODESET = 51;
  99. LC_ALL = 0;
  100. {$elseif defined(NetBSD)}
  101. CODESET = 51;
  102. LC_ALL = 0;
  103. {$elseif defined(aix)}
  104. CODESET = 49;
  105. LC_ALL = -1;
  106. {$else not aix}
  107. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  108. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  109. {$endif}
  110. { unicode encoding name }
  111. {$ifdef FPC_LITTLE_ENDIAN}
  112. unicode_encoding2 = 'UTF-16LE';
  113. unicode_encoding4 = 'UCS-4LE';
  114. {$else FPC_LITTLE_ENDIAN}
  115. {$ifdef AIX}
  116. unicode_encoding2 = 'UTF-16';
  117. {$else AIX}
  118. unicode_encoding2 = 'UTF-16BE';
  119. unicode_encoding4 = 'UCS-4BE';
  120. {$endif AIX}
  121. {$endif FPC_LITTLE_ENDIAN}
  122. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  123. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  124. { that's a libc macro mapped to internal functions/variables }
  125. { and thus not a stable external API on systems where libc }
  126. { breaks backwards compatibility every now and then }
  127. MB_CUR_MAX = 10;
  128. { Requests for iconvctl }
  129. ICONV_TRIVIALP = 0; // int *argument
  130. ICONV_GET_TRANSLITERATE = 1; // int *argument
  131. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  132. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  133. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  134. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  135. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  136. type
  137. piconv_t = ^iconv_t;
  138. iconv_t = pointer;
  139. nl_item = cint;
  140. {$ifdef haiku}
  141. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  142. {$else}
  143. {$ifndef beos}
  144. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  145. {$endif}
  146. {$endif}
  147. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  148. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  149. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  150. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  151. const
  152. iconvctlname='iconvctl';
  153. {$else}
  154. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  155. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  156. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  157. const
  158. iconvctlname='libiconvctl';
  159. {$endif}
  160. var
  161. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  162. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  163. threadvar
  164. iconv_ansi2wide,
  165. iconv_wide2ansi : iconv_t;
  166. { since we cache the iconv_t converters, we have to do the same
  167. for the DefaultSystemCodePage variable since if it changes, we
  168. have to re-initialize the converters too. We can't do that via
  169. a callback in the widestring manager because DefaultSystemCodePage
  170. is not a threadvar and we can't automatically change this in all
  171. threads }
  172. current_DefaultSystemCodePage: TSystemCodePage;
  173. {$i winiconv.inc}
  174. procedure InitThread;
  175. var
  176. transliterate: cint;
  177. iconvindex: longint;
  178. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  179. iconvname: rawbytestring;
  180. {$endif}
  181. begin
  182. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  183. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  184. iconvindex:=win2iconv(DefaultSystemCodePage);
  185. if iconvindex<>-1 then
  186. iconvname:=win2iconv_arr[iconvindex].name
  187. else
  188. { default to UTF-8 on Unix platforms }
  189. iconvname:='UTF-8';
  190. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  191. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  192. {$else}
  193. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  194. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  195. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  196. {$endif}
  197. if assigned(iconvctl) and
  198. (iconv_wide2ansi<>iconv_t(-1)) then
  199. begin
  200. transliterate:=1;
  201. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  202. end;
  203. end;
  204. procedure FiniThread;
  205. begin
  206. if (iconv_wide2ansi <> iconv_t(-1)) then
  207. iconv_close(iconv_wide2ansi);
  208. if (iconv_ansi2wide <> iconv_t(-1)) then
  209. iconv_close(iconv_ansi2wide);
  210. end;
  211. {$if defined(beos) and not defined(haiku)}
  212. function nl_langinfo(__item:nl_item):pchar;
  213. begin
  214. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  215. // Now implement the minimum required to correctly initialize WideString support
  216. case __item of
  217. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  218. else
  219. begin
  220. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  221. Result := '';
  222. end;
  223. end;
  224. end;
  225. {$endif}
  226. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  227. var
  228. iconvindex: longint;
  229. begin
  230. { TODO: add caching (then we also don't need separate code for
  231. the default system page and other ones)
  232. -- typecasting an ansistring function result to pchar is
  233. unsafe normally, but these are constant strings -> no
  234. problem }
  235. open_iconv_for_cps:=iconv_t(-1);
  236. iconvindex:=win2iconv(cp);
  237. if iconvindex=-1 then
  238. exit;
  239. repeat
  240. if cp_is_from then
  241. open_iconv_for_cps:=iconv_open(otherencoding,pchar(win2iconv_arr[iconvindex].name))
  242. else
  243. open_iconv_for_cps:=iconv_open(pchar(win2iconv_arr[iconvindex].name),otherencoding);
  244. inc(iconvindex);
  245. until (open_iconv_for_cps<>iconv_t(-1)) or
  246. (iconvindex>high(win2iconv_arr)) or
  247. (win2iconv_arr[iconvindex].cp<>cp);
  248. end;
  249. {$ifdef aix}
  250. {$i cwstraix.inc}
  251. {$endif aix}
  252. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  253. var
  254. outlength,
  255. outoffset,
  256. srclen,
  257. outleft : size_t;
  258. use_iconv: iconv_t;
  259. srcpos : pwidechar;
  260. destpos: pchar;
  261. mynil : pchar;
  262. my0 : size_t;
  263. err : longint;
  264. transliterate: cint;
  265. free_iconv: boolean;
  266. {$ifdef aix}
  267. intermediate: rawbytestring;
  268. {$endif aix}
  269. begin
  270. {$ifdef aix}
  271. { AIX libiconv does not support converting cp866 to anything else except
  272. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  273. if cp=866 then
  274. begin
  275. Wide2AnsiMove(source,intermediate,28595,len);
  276. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  277. exit;
  278. end;
  279. {$endif aix}
  280. if (cp=DefaultSystemCodePage) then
  281. begin
  282. { update iconv converter in case the DefaultSystemCodePage has been
  283. changed }
  284. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  285. begin
  286. FiniThread;
  287. InitThread;
  288. end;
  289. use_iconv:=iconv_wide2ansi;
  290. free_iconv:=false;
  291. end
  292. else
  293. begin
  294. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  295. if (use_iconv<>iconv_t(-1)) and
  296. assigned(iconvctl) then
  297. begin
  298. transliterate:=1;
  299. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  300. end;
  301. free_iconv:=true;
  302. end;
  303. { unsupported encoding -> default move }
  304. if use_iconv=iconv_t(-1) then
  305. begin
  306. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  307. exit;
  308. end;
  309. mynil:=nil;
  310. my0:=0;
  311. { rought estimation }
  312. setlength(dest,len*3);
  313. outlength:=len*3;
  314. srclen:=len*2;
  315. srcpos:=source;
  316. destpos:=pchar(dest);
  317. outleft:=outlength;
  318. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  319. begin
  320. err:=fpgetCerrno;
  321. case err of
  322. { last character is incomplete sequence }
  323. ESysEINVAL,
  324. { incomplete sequence in the middle }
  325. ESysEILSEQ:
  326. begin
  327. { skip and set to '?' }
  328. inc(srcpos);
  329. dec(srclen,2);
  330. destpos^:='?';
  331. inc(destpos);
  332. dec(outleft);
  333. { reset }
  334. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  335. if err=ESysEINVAL then
  336. break;
  337. end;
  338. ESysE2BIG:
  339. begin
  340. outoffset:=destpos-pchar(dest);
  341. { extend }
  342. setlength(dest,outlength+len*3);
  343. inc(outleft,len*3);
  344. inc(outlength,len*3);
  345. { string could have been moved }
  346. destpos:=pchar(dest)+outoffset;
  347. end;
  348. else
  349. runerror(231);
  350. end;
  351. end;
  352. // truncate string
  353. setlength(dest,length(dest)-outleft);
  354. SetCodePage(dest,cp,false);
  355. if free_iconv then
  356. iconv_close(use_iconv);
  357. end;
  358. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  359. var
  360. outlength,
  361. outoffset,
  362. outleft : size_t;
  363. use_iconv: iconv_t;
  364. srcpos,
  365. destpos: pchar;
  366. mynil : pchar;
  367. my0 : size_t;
  368. err: cint;
  369. iconvindex: longint;
  370. free_iconv: boolean;
  371. {$ifdef aix}
  372. intermediate: rawbytestring;
  373. {$endif aix}
  374. begin
  375. {$ifdef aix}
  376. { AIX libiconv does not support converting cp866 to anything else except
  377. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  378. if cp=866 then
  379. begin
  380. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  381. source:=pchar(intermediate);
  382. end;
  383. {$endif aix}
  384. if (cp=DefaultSystemCodePage) then
  385. begin
  386. { update iconv converter in case the DefaultSystemCodePage has been
  387. changed }
  388. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  389. begin
  390. FiniThread;
  391. InitThread;
  392. end;
  393. use_iconv:=iconv_ansi2wide;
  394. free_iconv:=false;
  395. end
  396. else
  397. begin
  398. { TODO: add caching (then we also don't need separate code for
  399. the default system page and other ones)
  400. -- typecasting an ansistring function result to pchar is
  401. unsafe normally, but these are constant strings -> no
  402. problem }
  403. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  404. free_iconv:=true;
  405. end;
  406. { unsupported encoding -> default move }
  407. if use_iconv=iconv_t(-1) then
  408. begin
  409. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  410. exit;
  411. end;
  412. mynil:=nil;
  413. my0:=0;
  414. // extra space
  415. outlength:=len+1;
  416. setlength(dest,outlength);
  417. srcpos:=source;
  418. destpos:=pchar(dest);
  419. outleft:=outlength*2;
  420. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  421. begin
  422. err:=fpgetCerrno;
  423. case err of
  424. ESysEINVAL,
  425. ESysEILSEQ:
  426. begin
  427. { skip and set to '?' }
  428. inc(srcpos);
  429. dec(len);
  430. pwidechar(destpos)^:='?';
  431. inc(destpos,2);
  432. dec(outleft,2);
  433. { reset }
  434. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  435. if err=ESysEINVAL then
  436. break;
  437. end;
  438. ESysE2BIG:
  439. begin
  440. outoffset:=destpos-pchar(dest);
  441. { extend }
  442. setlength(dest,outlength+len);
  443. inc(outleft,len*2);
  444. inc(outlength,len);
  445. { string could have been moved }
  446. destpos:=pchar(dest)+outoffset;
  447. end;
  448. else
  449. runerror(231);
  450. end;
  451. end;
  452. // truncate string
  453. setlength(dest,length(dest)-outleft div 2);
  454. if free_iconv then
  455. iconv_close(use_iconv);
  456. end;
  457. function LowerWideString(const s : WideString) : WideString;
  458. var
  459. i : SizeInt;
  460. begin
  461. SetLength(result,length(s));
  462. for i:=0 to length(s)-1 do
  463. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  464. end;
  465. function UpperWideString(const s : WideString) : WideString;
  466. var
  467. i : SizeInt;
  468. begin
  469. SetLength(result,length(s));
  470. for i:=0 to length(s)-1 do
  471. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  472. end;
  473. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  474. begin
  475. if (len>length(s)) then
  476. if (length(s) < 10*256) then
  477. setlength(s,length(s)+10)
  478. else
  479. setlength(s,length(s)+length(s) shr 8);
  480. end;
  481. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  482. begin
  483. EnsureAnsiLen(s,index);
  484. pchar(@s[index])^:=c;
  485. inc(index);
  486. end;
  487. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  488. {$ifndef beos}
  489. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  490. {$else not beos}
  491. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  492. {$endif beos}
  493. var
  494. p : pchar;
  495. mblen : size_t;
  496. begin
  497. { we know that s is unique -> avoid uniquestring calls}
  498. p:=@s[index];
  499. if (nc<=127) then
  500. ConcatCharToAnsiStr(char(nc),s,index)
  501. else
  502. begin
  503. EnsureAnsiLen(s,index+MB_CUR_MAX);
  504. {$ifndef beos}
  505. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  506. {$else not beos}
  507. mblen:=wctomb(p,wchar_t(nc));
  508. {$endif not beos}
  509. if (mblen<>size_t(-1)) then
  510. inc(index,mblen)
  511. else
  512. begin
  513. { invalid wide char }
  514. p^:='?';
  515. inc(index);
  516. end;
  517. end;
  518. end;
  519. function LowerAnsiString(const s : AnsiString) : AnsiString;
  520. var
  521. i, slen,
  522. resindex : SizeInt;
  523. mblen : size_t;
  524. {$ifndef beos}
  525. ombstate,
  526. nmbstate : mbstate_t;
  527. {$endif beos}
  528. wc : wchar_t;
  529. begin
  530. {$ifndef beos}
  531. fillchar(ombstate,sizeof(ombstate),0);
  532. fillchar(nmbstate,sizeof(nmbstate),0);
  533. {$endif beos}
  534. slen:=length(s);
  535. SetLength(result,slen+10);
  536. i:=1;
  537. resindex:=1;
  538. while (i<=slen) do
  539. begin
  540. if (s[i]<=#127) then
  541. begin
  542. wc:=wchar_t(s[i]);
  543. mblen:= 1;
  544. end
  545. else
  546. {$ifndef beos}
  547. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  548. {$else not beos}
  549. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  550. {$endif not beos}
  551. case mblen of
  552. size_t(-2):
  553. begin
  554. { partial invalid character, copy literally }
  555. while (i<=slen) do
  556. begin
  557. ConcatCharToAnsiStr(s[i],result,resindex);
  558. inc(i);
  559. end;
  560. end;
  561. size_t(-1), 0:
  562. begin
  563. { invalid or null character }
  564. ConcatCharToAnsiStr(s[i],result,resindex);
  565. inc(i);
  566. end;
  567. else
  568. begin
  569. { a valid sequence }
  570. { even if mblen = 1, the lowercase version may have a }
  571. { different length }
  572. { We can't do anything special if wchar_t is 16 bit... }
  573. {$ifndef beos}
  574. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  575. {$else not beos}
  576. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  577. {$endif not beos}
  578. inc(i,mblen);
  579. end;
  580. end;
  581. end;
  582. SetLength(result,resindex-1);
  583. end;
  584. function UpperAnsiString(const s : AnsiString) : AnsiString;
  585. var
  586. i, slen,
  587. resindex : SizeInt;
  588. mblen : size_t;
  589. {$ifndef beos}
  590. ombstate,
  591. nmbstate : mbstate_t;
  592. {$endif beos}
  593. wc : wchar_t;
  594. begin
  595. {$ifndef beos}
  596. fillchar(ombstate,sizeof(ombstate),0);
  597. fillchar(nmbstate,sizeof(nmbstate),0);
  598. {$endif beos}
  599. slen:=length(s);
  600. SetLength(result,slen+10);
  601. i:=1;
  602. resindex:=1;
  603. while (i<=slen) do
  604. begin
  605. if (s[i]<=#127) then
  606. begin
  607. wc:=wchar_t(s[i]);
  608. mblen:= 1;
  609. end
  610. else
  611. {$ifndef beos}
  612. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  613. {$else not beos}
  614. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  615. {$endif beos}
  616. case mblen of
  617. size_t(-2):
  618. begin
  619. { partial invalid character, copy literally }
  620. while (i<=slen) do
  621. begin
  622. ConcatCharToAnsiStr(s[i],result,resindex);
  623. inc(i);
  624. end;
  625. end;
  626. size_t(-1), 0:
  627. begin
  628. { invalid or null character }
  629. ConcatCharToAnsiStr(s[i],result,resindex);
  630. inc(i);
  631. end;
  632. else
  633. begin
  634. { a valid sequence }
  635. { even if mblen = 1, the uppercase version may have a }
  636. { different length }
  637. { We can't do anything special if wchar_t is 16 bit... }
  638. {$ifndef beos}
  639. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  640. {$else not beos}
  641. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  642. {$endif not beos}
  643. inc(i,mblen);
  644. end;
  645. end;
  646. end;
  647. SetLength(result,resindex-1);
  648. end;
  649. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  650. var
  651. i, slen,
  652. destindex : SizeInt;
  653. uch : UCS4Char;
  654. begin
  655. slen:=length(s);
  656. setlength(result,slen+1);
  657. i:=1;
  658. destindex:=0;
  659. while (i<=slen) do
  660. begin
  661. uch:=UCS4Char(s[i]);
  662. if (uch=0) then
  663. result[destindex]:=32
  664. else if (uch<=$d7ff) or (uch>=$e000) then
  665. result[destindex]:=uch
  666. else if (uch<=$dbff) and
  667. (i<slen) and
  668. (s[i+1]>=#$dc00) and
  669. (s[i+1]<=#$dfff) then
  670. begin
  671. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  672. inc(i);
  673. end
  674. else { invalid surrogate pair }
  675. result[destindex]:=uch;
  676. inc(i);
  677. inc(destindex);
  678. end;
  679. result[destindex]:=UCS4Char(0);
  680. { Trimming length in this particular case is just a waste of time,
  681. because result will be interpreted as null-terminated and discarded
  682. almost immediately }
  683. end;
  684. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  685. {$if not(defined (aix) and defined(cpupowerpc32))}
  686. var
  687. hs1,hs2 : UCS4String;
  688. begin
  689. { wcscoll interprets null chars as end-of-string -> filter out }
  690. hs1:=WideStringToUCS4StringNoNulls(s1);
  691. hs2:=WideStringToUCS4StringNoNulls(s2);
  692. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  693. end;
  694. {$else}
  695. { AIX/PPC32 has a 16 bit wchar_t }
  696. var
  697. i, len: longint;
  698. hs1, hs2: array of widechar;
  699. begin
  700. len:=length(s1);
  701. setlength(hs1,len+1);
  702. for i:=1 to len do
  703. if s1[i]<>#0 then
  704. hs1[i-1]:=s1[i]
  705. else
  706. hs1[i-1]:=#32;
  707. hs1[len]:=#0;
  708. len:=length(s2);
  709. setlength(hs2,len+1);
  710. for i:=1 to len do
  711. if s2[i]<>#0 then
  712. hs2[i-1]:=s2[i]
  713. else
  714. hs2[i-1]:=#32;
  715. hs2[len]:=#0;
  716. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  717. end;
  718. {$endif}
  719. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  720. begin
  721. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  722. end;
  723. function CharLengthPChar(const Str: PChar): PtrInt;
  724. var
  725. nextlen: ptrint;
  726. s: pchar;
  727. {$ifndef beos}
  728. mbstate: mbstate_t;
  729. {$endif not beos}
  730. begin
  731. result:=0;
  732. s:=str;
  733. {$ifndef beos}
  734. fillchar(mbstate,sizeof(mbstate),0);
  735. {$endif not beos}
  736. repeat
  737. {$ifdef beos}
  738. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  739. {$else beos}
  740. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  741. {$endif beos}
  742. { skip invalid/incomplete sequences }
  743. if (nextlen<0) then
  744. nextlen:=1;
  745. inc(result,nextlen);
  746. inc(s,nextlen);
  747. until (nextlen=0);
  748. end;
  749. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  750. var
  751. {$ifndef beos}
  752. mbstate: mbstate_t;
  753. {$endif not beos}
  754. begin
  755. {$ifdef beos}
  756. result:=ptrint(mblen(str,maxlookahead));
  757. {$else beos}
  758. fillchar(mbstate,sizeof(mbstate),0);
  759. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  760. { mbrlen can also return -2 for "incomplete but potially valid character
  761. and data has been processed" }
  762. if result<0 then
  763. result:=-1;
  764. {$endif beos}
  765. end;
  766. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  767. var
  768. a,b: pchar;
  769. i: PtrInt;
  770. begin
  771. if not(canmodifys1) then
  772. getmem(a,len1+1)
  773. else
  774. a:=s1;
  775. for i:=0 to len1-1 do
  776. if s1[i]<>#0 then
  777. a[i]:=s1[i]
  778. else
  779. a[i]:=#32;
  780. a[len1]:=#0;
  781. if not(canmodifys2) then
  782. getmem(b,len2+1)
  783. else
  784. b:=s2;
  785. for i:=0 to len2-1 do
  786. if s2[i]<>#0 then
  787. b[i]:=s2[i]
  788. else
  789. b[i]:=#32;
  790. b[len2]:=#0;
  791. result:=strcoll(a,b);
  792. if not(canmodifys1) then
  793. freemem(a);
  794. if not(canmodifys2) then
  795. freemem(b);
  796. end;
  797. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  798. begin
  799. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  800. end;
  801. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  802. begin
  803. result:=strcoll(s1,s2);
  804. end;
  805. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  806. var
  807. a, b: AnsiString;
  808. begin
  809. a:=UpperAnsistring(s1);
  810. b:=UpperAnsistring(s2);
  811. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  812. end;
  813. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  814. begin
  815. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  816. end;
  817. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  818. var
  819. a, b: pchar;
  820. begin
  821. if (maxlen=0) then
  822. exit(0);
  823. if (s1[maxlen]<>#0) then
  824. begin
  825. getmem(a,maxlen+1);
  826. move(s1^,a^,maxlen);
  827. a[maxlen]:=#0;
  828. end
  829. else
  830. a:=s1;
  831. if (s2[maxlen]<>#0) then
  832. begin
  833. getmem(b,maxlen+1);
  834. move(s2^,b^,maxlen);
  835. b[maxlen]:=#0;
  836. end
  837. else
  838. b:=s2;
  839. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  840. if (a<>s1) then
  841. freemem(a);
  842. if (b<>s2) then
  843. freemem(b);
  844. end;
  845. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  846. var
  847. a, b: ansistring;
  848. begin
  849. if (maxlen=0) then
  850. exit(0);
  851. setlength(a,maxlen);
  852. move(s1^,a[1],maxlen);
  853. setlength(b,maxlen);
  854. move(s2^,b[1],maxlen);
  855. result:=AnsiCompareText(a,b);
  856. end;
  857. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  858. var
  859. newlen: sizeint;
  860. begin
  861. newlen:=length(s);
  862. if newlen>strlen(orgp) then
  863. fpc_rangeerror;
  864. p:=orgp;
  865. if (newlen>0) then
  866. move(s[1],p[0],newlen);
  867. p[newlen]:=#0;
  868. end;
  869. function AnsiStrLower(Str: PChar): PChar;
  870. var
  871. temp: ansistring;
  872. begin
  873. temp:=loweransistring(str);
  874. ansi2pchar(temp,str,result);
  875. end;
  876. function AnsiStrUpper(Str: PChar): PChar;
  877. var
  878. temp: ansistring;
  879. begin
  880. temp:=upperansistring(str);
  881. ansi2pchar(temp,str,result);
  882. end;
  883. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  884. var
  885. langinfo: pchar;
  886. begin
  887. langinfo:=nl_langinfo(CODESET);
  888. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  889. that causes it to return an empty string of UTF-8 locales
  890. -> patch up (and in general, UTF-8 is a good default on
  891. Unix platforms) }
  892. if not assigned(langinfo) or
  893. (langinfo^=#0) then
  894. langinfo:='UTF-8';
  895. Result := iconv2win(ansistring(langinfo));
  896. end;
  897. {$ifdef FPC_HAS_CPSTRING}
  898. {$i textrec.inc}
  899. procedure SetStdIOCodePage(var T: Text); inline;
  900. begin
  901. case TextRec(T).Mode of
  902. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  903. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  904. end;
  905. end;
  906. procedure SetStdIOCodePages; inline;
  907. begin
  908. SetStdIOCodePage(Input);
  909. SetStdIOCodePage(Output);
  910. SetStdIOCodePage(ErrOutput);
  911. SetStdIOCodePage(StdOut);
  912. SetStdIOCodePage(StdErr);
  913. end;
  914. {$endif FPC_HAS_CPSTRING}
  915. Procedure SetCWideStringManager;
  916. Var
  917. CWideStringManager : TUnicodeStringManager;
  918. begin
  919. CWideStringManager:=widestringmanager;
  920. With CWideStringManager do
  921. begin
  922. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  923. Ansi2WideMoveProc:=@Ansi2WideMove;
  924. UpperWideStringProc:=@UpperWideString;
  925. LowerWideStringProc:=@LowerWideString;
  926. CompareWideStringProc:=@CompareWideString;
  927. CompareTextWideStringProc:=@CompareTextWideString;
  928. CharLengthPCharProc:=@CharLengthPChar;
  929. CodePointLengthProc:=@CodePointLength;
  930. UpperAnsiStringProc:=@UpperAnsiString;
  931. LowerAnsiStringProc:=@LowerAnsiString;
  932. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  933. CompareTextAnsiStringProc:=@AnsiCompareText;
  934. StrCompAnsiStringProc:=@StrCompAnsi;
  935. StrICompAnsiStringProc:=@AnsiStrIComp;
  936. StrLCompAnsiStringProc:=@AnsiStrLComp;
  937. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  938. StrLowerAnsiStringProc:=@AnsiStrLower;
  939. StrUpperAnsiStringProc:=@AnsiStrUpper;
  940. ThreadInitProc:=@InitThread;
  941. ThreadFiniProc:=@FiniThread;
  942. { Unicode }
  943. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  944. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  945. UpperUnicodeStringProc:=@UpperWideString;
  946. LowerUnicodeStringProc:=@LowerWideString;
  947. CompareUnicodeStringProc:=@CompareWideString;
  948. CompareTextUnicodeStringProc:=@CompareTextWideString;
  949. { CodePage }
  950. GetStandardCodePageProc:=@GetStandardCodePage;
  951. end;
  952. SetUnicodeStringManager(CWideStringManager);
  953. end;
  954. var
  955. iconvlib:TLibHandle;
  956. initialization
  957. SetCWideStringManager;
  958. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  959. { with the information from the environment variables according to POSIX }
  960. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  961. setlocale(LC_ALL,'');
  962. { load iconvctl function }
  963. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  964. if iconvlib<>0 then
  965. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  966. { set the DefaultSystemCodePage }
  967. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  968. {$ifdef FPC_HAS_CPSTRING}
  969. SetStdIOCodePages;
  970. {$endif FPC_HAS_CPSTRING}
  971. { init conversion tables for main program }
  972. InitThread;
  973. finalization
  974. { fini conversion tables for main program }
  975. FiniThread;
  976. { unload iconv library }
  977. if iconvlib<>0 then
  978. FreeLibrary(iconvlib);
  979. end.