cwstring.pp 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(haiku)}
  22. {$linklib textencoding}
  23. {$linklib locale}
  24. {$else}
  25. {$linklib iconv}
  26. {$endif}
  27. {$define useiconv}
  28. {$endif linux}
  29. Uses
  30. BaseUnix,
  31. ctypes,
  32. unix,
  33. unixtype,
  34. initc,
  35. dynlibs;
  36. Const
  37. {$ifndef useiconv}
  38. libiconvname='c'; // is in libc under Linux.
  39. libprefix='lib';
  40. {$else}
  41. {$ifdef haiku}
  42. libiconvname='textencoding'; // is in libtextencoding under Haiku
  43. libprefix='lib';
  44. {$else}
  45. {$if defined(darwin) or defined(aix)}
  46. libiconvname='libiconv';
  47. libprefix='';
  48. {$else}
  49. libiconvname='iconv';
  50. libprefix='lib';
  51. {$endif}
  52. {$endif}
  53. {$endif}
  54. { helper functions from libc }
  55. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  56. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  57. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  58. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  59. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  60. {$ifndef beos}
  61. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  62. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  63. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  64. {$else beos}
  65. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  66. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  67. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  68. {$endif beos}
  69. const
  70. {$if defined(linux)}
  71. __LC_CTYPE = 0;
  72. LC_ALL = 6;
  73. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  74. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  75. CODESET = _NL_CTYPE_CODESET_NAME;
  76. {$elseif defined(darwin)}
  77. CODESET = 0;
  78. LC_ALL = 0;
  79. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  80. __LC_CTYPE = 0;
  81. LC_ALL = 0;
  82. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  83. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  84. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  85. {$elseif defined(solaris)}
  86. {$define ACCEPT_646}
  87. CODESET=49;
  88. LC_ALL = 6;
  89. {$elseif defined(beos)}
  90. {$warning check correct value for BeOS}
  91. CODESET=49;
  92. {$ifdef haiku}
  93. LC_ALL = 0; // Checked for Haiku
  94. {$else}
  95. LC_ALL = 6; // Checked for BeOS
  96. {$endif}
  97. ESysEILSEQ = EILSEQ;
  98. {$elseif defined(OpenBSD)}
  99. {$define ACCEPT_646}
  100. CODESET = 51;
  101. LC_ALL = 0;
  102. {$elseif defined(NetBSD)}
  103. {$define ACCEPT_646}
  104. CODESET = 51;
  105. LC_ALL = 0;
  106. {$elseif defined(aix)}
  107. CODESET = 49;
  108. LC_ALL = -1;
  109. {$else not aix}
  110. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  111. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  112. {$endif}
  113. { unicode encoding name }
  114. {$ifdef FPC_LITTLE_ENDIAN}
  115. unicode_encoding2 = 'UTF-16LE';
  116. unicode_encoding4 = 'UCS-4LE';
  117. {$else FPC_LITTLE_ENDIAN}
  118. {$ifdef AIX}
  119. unicode_encoding2 = 'UTF-16';
  120. {$else AIX}
  121. unicode_encoding2 = 'UTF-16BE';
  122. unicode_encoding4 = 'UCS-4BE';
  123. {$endif AIX}
  124. {$endif FPC_LITTLE_ENDIAN}
  125. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  126. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  127. { that's a libc macro mapped to internal functions/variables }
  128. { and thus not a stable external API on systems where libc }
  129. { breaks backwards compatibility every now and then }
  130. MB_CUR_MAX = 10;
  131. { Requests for iconvctl }
  132. ICONV_TRIVIALP = 0; // int *argument
  133. ICONV_GET_TRANSLITERATE = 1; // int *argument
  134. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  135. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  136. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  137. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  138. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  139. type
  140. piconv_t = ^iconv_t;
  141. iconv_t = pointer;
  142. nl_item = cint;
  143. {$ifdef haiku}
  144. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  145. {$else}
  146. {$ifndef beos}
  147. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  148. {$endif}
  149. {$endif}
  150. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  151. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  152. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  153. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  154. const
  155. iconvctlname='iconvctl';
  156. {$else}
  157. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  158. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  159. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  160. const
  161. iconvctlname='libiconvctl';
  162. {$endif}
  163. var
  164. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  165. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  166. threadvar
  167. iconv_ansi2wide,
  168. iconv_wide2ansi : iconv_t;
  169. { since we cache the iconv_t converters, we have to do the same
  170. for the DefaultSystemCodePage variable since if it changes, we
  171. have to re-initialize the converters too. We can't do that via
  172. a callback in the widestring manager because DefaultSystemCodePage
  173. is not a threadvar and we can't automatically change this in all
  174. threads }
  175. current_DefaultSystemCodePage: TSystemCodePage;
  176. {$i winiconv.inc}
  177. procedure InitThread;
  178. var
  179. transliterate: cint;
  180. iconvindex: longint;
  181. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  182. iconvname: rawbytestring;
  183. {$endif}
  184. begin
  185. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  186. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  187. iconvindex:=win2iconv(DefaultSystemCodePage);
  188. if iconvindex<>-1 then
  189. iconvname:=win2iconv_arr[iconvindex].name
  190. else
  191. { default to UTF-8 on Unix platforms }
  192. iconvname:='UTF-8';
  193. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  194. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  195. {$else}
  196. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  197. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  198. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  199. {$endif}
  200. if assigned(iconvctl) and
  201. (iconv_wide2ansi<>iconv_t(-1)) then
  202. begin
  203. transliterate:=1;
  204. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  205. end;
  206. end;
  207. procedure FiniThread;
  208. begin
  209. if (iconv_wide2ansi <> iconv_t(-1)) then
  210. iconv_close(iconv_wide2ansi);
  211. if (iconv_ansi2wide <> iconv_t(-1)) then
  212. iconv_close(iconv_ansi2wide);
  213. end;
  214. {$if defined(beos) and not defined(haiku)}
  215. function nl_langinfo(__item:nl_item):pchar;
  216. begin
  217. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  218. // Now implement the minimum required to correctly initialize WideString support
  219. case __item of
  220. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  221. else
  222. begin
  223. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  224. Result := '';
  225. end;
  226. end;
  227. end;
  228. {$endif}
  229. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  230. var
  231. iconvindex: longint;
  232. begin
  233. { TODO: add caching (then we also don't need separate code for
  234. the default system page and other ones)
  235. -- typecasting an ansistring function result to pchar is
  236. unsafe normally, but these are constant strings -> no
  237. problem }
  238. open_iconv_for_cps:=iconv_t(-1);
  239. iconvindex:=win2iconv(cp);
  240. if iconvindex=-1 then
  241. exit;
  242. repeat
  243. if cp_is_from then
  244. open_iconv_for_cps:=iconv_open(otherencoding,pchar(win2iconv_arr[iconvindex].name))
  245. else
  246. open_iconv_for_cps:=iconv_open(pchar(win2iconv_arr[iconvindex].name),otherencoding);
  247. inc(iconvindex);
  248. until (open_iconv_for_cps<>iconv_t(-1)) or
  249. (iconvindex>high(win2iconv_arr)) or
  250. (win2iconv_arr[iconvindex].cp<>cp);
  251. end;
  252. {$ifdef aix}
  253. {$i cwstraix.inc}
  254. {$endif aix}
  255. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  256. var
  257. outlength,
  258. outoffset,
  259. srclen,
  260. outleft : size_t;
  261. use_iconv: iconv_t;
  262. srcpos : pwidechar;
  263. destpos: pchar;
  264. mynil : pchar;
  265. my0 : size_t;
  266. err : longint;
  267. transliterate: cint;
  268. free_iconv: boolean;
  269. {$ifdef aix}
  270. intermediate: rawbytestring;
  271. {$endif aix}
  272. begin
  273. {$ifdef aix}
  274. { AIX libiconv does not support converting cp866 to anything else except
  275. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  276. if cp=866 then
  277. begin
  278. Wide2AnsiMove(source,intermediate,28595,len);
  279. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  280. exit;
  281. end;
  282. {$endif aix}
  283. if (cp=DefaultSystemCodePage) then
  284. begin
  285. { update iconv converter in case the DefaultSystemCodePage has been
  286. changed }
  287. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  288. begin
  289. FiniThread;
  290. InitThread;
  291. end;
  292. use_iconv:=iconv_wide2ansi;
  293. free_iconv:=false;
  294. end
  295. else
  296. begin
  297. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  298. if (use_iconv<>iconv_t(-1)) and
  299. assigned(iconvctl) then
  300. begin
  301. transliterate:=1;
  302. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  303. end;
  304. free_iconv:=true;
  305. end;
  306. { unsupported encoding -> default move }
  307. if use_iconv=iconv_t(-1) then
  308. begin
  309. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  310. exit;
  311. end;
  312. mynil:=nil;
  313. my0:=0;
  314. { rought estimation }
  315. setlength(dest,len*3);
  316. outlength:=len*3;
  317. srclen:=len*2;
  318. srcpos:=source;
  319. destpos:=pchar(dest);
  320. outleft:=outlength;
  321. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  322. begin
  323. err:=fpgetCerrno;
  324. case err of
  325. { last character is incomplete sequence }
  326. ESysEINVAL,
  327. { incomplete sequence in the middle }
  328. ESysEILSEQ:
  329. begin
  330. { skip and set to '?' }
  331. inc(srcpos);
  332. dec(srclen,2);
  333. destpos^:='?';
  334. inc(destpos);
  335. dec(outleft);
  336. { reset }
  337. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  338. if err=ESysEINVAL then
  339. break;
  340. end;
  341. ESysE2BIG:
  342. begin
  343. outoffset:=destpos-pchar(dest);
  344. { extend }
  345. setlength(dest,outlength+len*3);
  346. inc(outleft,len*3);
  347. inc(outlength,len*3);
  348. { string could have been moved }
  349. destpos:=pchar(dest)+outoffset;
  350. end;
  351. else
  352. runerror(231);
  353. end;
  354. end;
  355. // truncate string
  356. setlength(dest,length(dest)-outleft);
  357. SetCodePage(dest,cp,false);
  358. if free_iconv then
  359. iconv_close(use_iconv);
  360. end;
  361. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  362. var
  363. outlength,
  364. outoffset,
  365. outleft : size_t;
  366. use_iconv: iconv_t;
  367. srcpos,
  368. destpos: pchar;
  369. mynil : pchar;
  370. my0 : size_t;
  371. err: cint;
  372. iconvindex: longint;
  373. free_iconv: boolean;
  374. {$ifdef aix}
  375. intermediate: rawbytestring;
  376. {$endif aix}
  377. begin
  378. {$ifdef aix}
  379. { AIX libiconv does not support converting cp866 to anything else except
  380. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  381. if cp=866 then
  382. begin
  383. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  384. source:=pchar(intermediate);
  385. end;
  386. {$endif aix}
  387. if (cp=DefaultSystemCodePage) then
  388. begin
  389. { update iconv converter in case the DefaultSystemCodePage has been
  390. changed }
  391. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  392. begin
  393. FiniThread;
  394. InitThread;
  395. end;
  396. use_iconv:=iconv_ansi2wide;
  397. free_iconv:=false;
  398. end
  399. else
  400. begin
  401. { TODO: add caching (then we also don't need separate code for
  402. the default system page and other ones)
  403. -- typecasting an ansistring function result to pchar is
  404. unsafe normally, but these are constant strings -> no
  405. problem }
  406. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  407. free_iconv:=true;
  408. end;
  409. { unsupported encoding -> default move }
  410. if use_iconv=iconv_t(-1) then
  411. begin
  412. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  413. exit;
  414. end;
  415. mynil:=nil;
  416. my0:=0;
  417. // extra space
  418. outlength:=len+1;
  419. setlength(dest,outlength);
  420. srcpos:=source;
  421. destpos:=pchar(dest);
  422. outleft:=outlength*2;
  423. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  424. begin
  425. err:=fpgetCerrno;
  426. case err of
  427. ESysEINVAL,
  428. ESysEILSEQ:
  429. begin
  430. { skip and set to '?' }
  431. inc(srcpos);
  432. dec(len);
  433. pwidechar(destpos)^:='?';
  434. inc(destpos,2);
  435. dec(outleft,2);
  436. { reset }
  437. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  438. if err=ESysEINVAL then
  439. break;
  440. end;
  441. ESysE2BIG:
  442. begin
  443. outoffset:=destpos-pchar(dest);
  444. { extend }
  445. setlength(dest,outlength+len);
  446. inc(outleft,len*2);
  447. inc(outlength,len);
  448. { string could have been moved }
  449. destpos:=pchar(dest)+outoffset;
  450. end;
  451. else
  452. runerror(231);
  453. end;
  454. end;
  455. // truncate string
  456. setlength(dest,length(dest)-outleft div 2);
  457. if free_iconv then
  458. iconv_close(use_iconv);
  459. end;
  460. function LowerWideString(const s : WideString) : WideString;
  461. var
  462. i : SizeInt;
  463. begin
  464. SetLength(result,length(s));
  465. for i:=0 to length(s)-1 do
  466. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  467. end;
  468. function UpperWideString(const s : WideString) : WideString;
  469. var
  470. i : SizeInt;
  471. begin
  472. SetLength(result,length(s));
  473. for i:=0 to length(s)-1 do
  474. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  475. end;
  476. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  477. begin
  478. if (len>length(s)) then
  479. if (length(s) < 10*256) then
  480. setlength(s,length(s)+10)
  481. else
  482. setlength(s,length(s)+length(s) shr 8);
  483. end;
  484. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  485. begin
  486. EnsureAnsiLen(s,index);
  487. pchar(@s[index])^:=c;
  488. inc(index);
  489. end;
  490. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  491. {$ifndef beos}
  492. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  493. {$else not beos}
  494. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  495. {$endif beos}
  496. var
  497. p : pchar;
  498. mblen : size_t;
  499. begin
  500. { we know that s is unique -> avoid uniquestring calls}
  501. p:=@s[index];
  502. if (nc<=127) then
  503. ConcatCharToAnsiStr(char(nc),s,index)
  504. else
  505. begin
  506. EnsureAnsiLen(s,index+MB_CUR_MAX);
  507. {$ifndef beos}
  508. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  509. {$else not beos}
  510. mblen:=wctomb(p,wchar_t(nc));
  511. {$endif not beos}
  512. if (mblen<>size_t(-1)) then
  513. inc(index,mblen)
  514. else
  515. begin
  516. { invalid wide char }
  517. p^:='?';
  518. inc(index);
  519. end;
  520. end;
  521. end;
  522. function LowerAnsiString(const s : AnsiString) : AnsiString;
  523. var
  524. i, slen,
  525. resindex : SizeInt;
  526. mblen : size_t;
  527. {$ifndef beos}
  528. ombstate,
  529. nmbstate : mbstate_t;
  530. {$endif beos}
  531. wc : wchar_t;
  532. begin
  533. {$ifndef beos}
  534. fillchar(ombstate,sizeof(ombstate),0);
  535. fillchar(nmbstate,sizeof(nmbstate),0);
  536. {$endif beos}
  537. slen:=length(s);
  538. SetLength(result,slen+10);
  539. i:=1;
  540. resindex:=1;
  541. while (i<=slen) do
  542. begin
  543. if (s[i]<=#127) then
  544. begin
  545. wc:=wchar_t(s[i]);
  546. mblen:= 1;
  547. end
  548. else
  549. {$ifndef beos}
  550. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  551. {$else not beos}
  552. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  553. {$endif not beos}
  554. case mblen of
  555. size_t(-2):
  556. begin
  557. { partial invalid character, copy literally }
  558. while (i<=slen) do
  559. begin
  560. ConcatCharToAnsiStr(s[i],result,resindex);
  561. inc(i);
  562. end;
  563. end;
  564. size_t(-1), 0:
  565. begin
  566. { invalid or null character }
  567. ConcatCharToAnsiStr(s[i],result,resindex);
  568. inc(i);
  569. end;
  570. else
  571. begin
  572. { a valid sequence }
  573. { even if mblen = 1, the lowercase version may have a }
  574. { different length }
  575. { We can't do anything special if wchar_t is 16 bit... }
  576. {$ifndef beos}
  577. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  578. {$else not beos}
  579. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  580. {$endif not beos}
  581. inc(i,mblen);
  582. end;
  583. end;
  584. end;
  585. SetLength(result,resindex-1);
  586. end;
  587. function UpperAnsiString(const s : AnsiString) : AnsiString;
  588. var
  589. i, slen,
  590. resindex : SizeInt;
  591. mblen : size_t;
  592. {$ifndef beos}
  593. ombstate,
  594. nmbstate : mbstate_t;
  595. {$endif beos}
  596. wc : wchar_t;
  597. begin
  598. {$ifndef beos}
  599. fillchar(ombstate,sizeof(ombstate),0);
  600. fillchar(nmbstate,sizeof(nmbstate),0);
  601. {$endif beos}
  602. slen:=length(s);
  603. SetLength(result,slen+10);
  604. i:=1;
  605. resindex:=1;
  606. while (i<=slen) do
  607. begin
  608. if (s[i]<=#127) then
  609. begin
  610. wc:=wchar_t(s[i]);
  611. mblen:= 1;
  612. end
  613. else
  614. {$ifndef beos}
  615. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  616. {$else not beos}
  617. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  618. {$endif beos}
  619. case mblen of
  620. size_t(-2):
  621. begin
  622. { partial invalid character, copy literally }
  623. while (i<=slen) do
  624. begin
  625. ConcatCharToAnsiStr(s[i],result,resindex);
  626. inc(i);
  627. end;
  628. end;
  629. size_t(-1), 0:
  630. begin
  631. { invalid or null character }
  632. ConcatCharToAnsiStr(s[i],result,resindex);
  633. inc(i);
  634. end;
  635. else
  636. begin
  637. { a valid sequence }
  638. { even if mblen = 1, the uppercase version may have a }
  639. { different length }
  640. { We can't do anything special if wchar_t is 16 bit... }
  641. {$ifndef beos}
  642. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  643. {$else not beos}
  644. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  645. {$endif not beos}
  646. inc(i,mblen);
  647. end;
  648. end;
  649. end;
  650. SetLength(result,resindex-1);
  651. end;
  652. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  653. var
  654. i, slen,
  655. destindex : SizeInt;
  656. uch : UCS4Char;
  657. begin
  658. slen:=length(s);
  659. setlength(result,slen+1);
  660. i:=1;
  661. destindex:=0;
  662. while (i<=slen) do
  663. begin
  664. uch:=UCS4Char(s[i]);
  665. if (uch=0) then
  666. result[destindex]:=32
  667. else if (uch<=$d7ff) or (uch>=$e000) then
  668. result[destindex]:=uch
  669. else if (uch<=$dbff) and
  670. (i<slen) and
  671. (s[i+1]>=#$dc00) and
  672. (s[i+1]<=#$dfff) then
  673. begin
  674. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  675. inc(i);
  676. end
  677. else { invalid surrogate pair }
  678. result[destindex]:=uch;
  679. inc(i);
  680. inc(destindex);
  681. end;
  682. result[destindex]:=UCS4Char(0);
  683. { Trimming length in this particular case is just a waste of time,
  684. because result will be interpreted as null-terminated and discarded
  685. almost immediately }
  686. end;
  687. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  688. {$if not(defined (aix) and defined(cpupowerpc32))}
  689. var
  690. hs1,hs2 : UCS4String;
  691. begin
  692. { wcscoll interprets null chars as end-of-string -> filter out }
  693. hs1:=WideStringToUCS4StringNoNulls(s1);
  694. hs2:=WideStringToUCS4StringNoNulls(s2);
  695. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  696. end;
  697. {$else}
  698. { AIX/PPC32 has a 16 bit wchar_t }
  699. var
  700. i, len: longint;
  701. hs1, hs2: array of widechar;
  702. begin
  703. len:=length(s1);
  704. setlength(hs1,len+1);
  705. for i:=1 to len do
  706. if s1[i]<>#0 then
  707. hs1[i-1]:=s1[i]
  708. else
  709. hs1[i-1]:=#32;
  710. hs1[len]:=#0;
  711. len:=length(s2);
  712. setlength(hs2,len+1);
  713. for i:=1 to len do
  714. if s2[i]<>#0 then
  715. hs2[i-1]:=s2[i]
  716. else
  717. hs2[i-1]:=#32;
  718. hs2[len]:=#0;
  719. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  720. end;
  721. {$endif}
  722. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  723. begin
  724. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  725. end;
  726. function CharLengthPChar(const Str: PChar): PtrInt;
  727. var
  728. nextlen: ptrint;
  729. s: pchar;
  730. {$ifndef beos}
  731. mbstate: mbstate_t;
  732. {$endif not beos}
  733. begin
  734. result:=0;
  735. s:=str;
  736. {$ifndef beos}
  737. fillchar(mbstate,sizeof(mbstate),0);
  738. {$endif not beos}
  739. repeat
  740. {$ifdef beos}
  741. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  742. {$else beos}
  743. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  744. {$endif beos}
  745. { skip invalid/incomplete sequences }
  746. if (nextlen<0) then
  747. nextlen:=1;
  748. inc(result,nextlen);
  749. inc(s,nextlen);
  750. until (nextlen=0);
  751. end;
  752. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  753. {$ifndef beos}
  754. var
  755. mbstate: mbstate_t;
  756. {$endif not beos}
  757. begin
  758. {$ifdef beos}
  759. result:=ptrint(mblen(str,maxlookahead));
  760. {$else beos}
  761. fillchar(mbstate,sizeof(mbstate),0);
  762. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  763. { mbrlen can also return -2 for "incomplete but potially valid character
  764. and data has been processed" }
  765. if result<0 then
  766. result:=-1;
  767. {$endif beos}
  768. end;
  769. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  770. var
  771. a,b: pchar;
  772. i: PtrInt;
  773. begin
  774. if not(canmodifys1) then
  775. getmem(a,len1+1)
  776. else
  777. a:=s1;
  778. for i:=0 to len1-1 do
  779. if s1[i]<>#0 then
  780. a[i]:=s1[i]
  781. else
  782. a[i]:=#32;
  783. a[len1]:=#0;
  784. if not(canmodifys2) then
  785. getmem(b,len2+1)
  786. else
  787. b:=s2;
  788. for i:=0 to len2-1 do
  789. if s2[i]<>#0 then
  790. b[i]:=s2[i]
  791. else
  792. b[i]:=#32;
  793. b[len2]:=#0;
  794. result:=strcoll(a,b);
  795. if not(canmodifys1) then
  796. freemem(a);
  797. if not(canmodifys2) then
  798. freemem(b);
  799. end;
  800. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  801. begin
  802. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  803. end;
  804. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  805. begin
  806. result:=strcoll(s1,s2);
  807. end;
  808. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  809. var
  810. a, b: AnsiString;
  811. begin
  812. a:=UpperAnsistring(s1);
  813. b:=UpperAnsistring(s2);
  814. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  815. end;
  816. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  817. begin
  818. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  819. end;
  820. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  821. var
  822. a, b: pchar;
  823. begin
  824. if (maxlen=0) then
  825. exit(0);
  826. if (s1[maxlen]<>#0) then
  827. begin
  828. getmem(a,maxlen+1);
  829. move(s1^,a^,maxlen);
  830. a[maxlen]:=#0;
  831. end
  832. else
  833. a:=s1;
  834. if (s2[maxlen]<>#0) then
  835. begin
  836. getmem(b,maxlen+1);
  837. move(s2^,b^,maxlen);
  838. b[maxlen]:=#0;
  839. end
  840. else
  841. b:=s2;
  842. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  843. if (a<>s1) then
  844. freemem(a);
  845. if (b<>s2) then
  846. freemem(b);
  847. end;
  848. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  849. var
  850. a, b: ansistring;
  851. begin
  852. if (maxlen=0) then
  853. exit(0);
  854. setlength(a,maxlen);
  855. move(s1^,a[1],maxlen);
  856. setlength(b,maxlen);
  857. move(s2^,b[1],maxlen);
  858. result:=AnsiCompareText(a,b);
  859. end;
  860. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  861. var
  862. newlen: sizeint;
  863. begin
  864. newlen:=length(s);
  865. if newlen>strlen(orgp) then
  866. fpc_rangeerror;
  867. p:=orgp;
  868. if (newlen>0) then
  869. move(s[1],p[0],newlen);
  870. p[newlen]:=#0;
  871. end;
  872. function AnsiStrLower(Str: PChar): PChar;
  873. var
  874. temp: ansistring;
  875. begin
  876. temp:=loweransistring(str);
  877. ansi2pchar(temp,str,result);
  878. end;
  879. function AnsiStrUpper(Str: PChar): PChar;
  880. var
  881. temp: ansistring;
  882. begin
  883. temp:=upperansistring(str);
  884. ansi2pchar(temp,str,result);
  885. end;
  886. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  887. var
  888. langinfo: pchar;
  889. begin
  890. langinfo:=nl_langinfo(CODESET);
  891. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  892. that causes it to return an empty string of UTF-8 locales
  893. -> patch up (and in general, UTF-8 is a good default on
  894. Unix platforms) }
  895. if not assigned(langinfo) or
  896. (langinfo^=#0) then
  897. langinfo:='UTF-8';
  898. Result := iconv2win(ansistring(langinfo));
  899. end;
  900. {$ifdef FPC_HAS_CPSTRING}
  901. {$i textrec.inc}
  902. procedure SetStdIOCodePage(var T: Text); inline;
  903. begin
  904. case TextRec(T).Mode of
  905. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  906. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  907. end;
  908. end;
  909. procedure SetStdIOCodePages; inline;
  910. begin
  911. SetStdIOCodePage(Input);
  912. SetStdIOCodePage(Output);
  913. SetStdIOCodePage(ErrOutput);
  914. SetStdIOCodePage(StdOut);
  915. SetStdIOCodePage(StdErr);
  916. end;
  917. {$endif FPC_HAS_CPSTRING}
  918. Procedure SetCWideStringManager;
  919. Var
  920. CWideStringManager : TUnicodeStringManager;
  921. begin
  922. CWideStringManager:=widestringmanager;
  923. With CWideStringManager do
  924. begin
  925. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  926. Ansi2WideMoveProc:=@Ansi2WideMove;
  927. UpperWideStringProc:=@UpperWideString;
  928. LowerWideStringProc:=@LowerWideString;
  929. CompareWideStringProc:=@CompareWideString;
  930. CompareTextWideStringProc:=@CompareTextWideString;
  931. CharLengthPCharProc:=@CharLengthPChar;
  932. CodePointLengthProc:=@CodePointLength;
  933. UpperAnsiStringProc:=@UpperAnsiString;
  934. LowerAnsiStringProc:=@LowerAnsiString;
  935. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  936. CompareTextAnsiStringProc:=@AnsiCompareText;
  937. StrCompAnsiStringProc:=@StrCompAnsi;
  938. StrICompAnsiStringProc:=@AnsiStrIComp;
  939. StrLCompAnsiStringProc:=@AnsiStrLComp;
  940. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  941. StrLowerAnsiStringProc:=@AnsiStrLower;
  942. StrUpperAnsiStringProc:=@AnsiStrUpper;
  943. ThreadInitProc:=@InitThread;
  944. ThreadFiniProc:=@FiniThread;
  945. { Unicode }
  946. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  947. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  948. UpperUnicodeStringProc:=@UpperWideString;
  949. LowerUnicodeStringProc:=@LowerWideString;
  950. CompareUnicodeStringProc:=@CompareWideString;
  951. CompareTextUnicodeStringProc:=@CompareTextWideString;
  952. { CodePage }
  953. GetStandardCodePageProc:=@GetStandardCodePage;
  954. end;
  955. SetUnicodeStringManager(CWideStringManager);
  956. end;
  957. var
  958. iconvlib:TLibHandle;
  959. initialization
  960. SetCWideStringManager;
  961. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  962. { with the information from the environment variables according to POSIX }
  963. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  964. setlocale(LC_ALL,'');
  965. { load iconvctl function }
  966. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  967. if iconvlib<>0 then
  968. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  969. { set the DefaultSystemCodePage }
  970. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  971. {$ifdef FPC_HAS_CPSTRING}
  972. SetStdIOCodePages;
  973. {$endif FPC_HAS_CPSTRING}
  974. { init conversion tables for main program }
  975. InitThread;
  976. finalization
  977. { fini conversion tables for main program }
  978. FiniThread;
  979. { unload iconv library }
  980. if iconvlib<>0 then
  981. FreeLibrary(iconvlib);
  982. end.