cwstring.pp 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(haiku)}
  22. {$linklib textencoding}
  23. {$linklib locale}
  24. {$else}
  25. {$linklib iconv}
  26. {$endif}
  27. {$define useiconv}
  28. {$endif linux}
  29. Uses
  30. BaseUnix,
  31. ctypes,
  32. unix,
  33. unixtype,
  34. initc,
  35. dynlibs;
  36. Const
  37. {$ifndef useiconv}
  38. libiconvname='c'; // is in libc under Linux.
  39. libprefix='lib';
  40. {$else}
  41. {$ifdef haiku}
  42. libiconvname='textencoding'; // is in libtextencoding under Haiku
  43. libprefix='lib';
  44. {$else}
  45. {$if defined(darwin) or defined(aix)}
  46. libiconvname='libiconv';
  47. libprefix='';
  48. {$else}
  49. libiconvname='iconv';
  50. libprefix='lib';
  51. {$endif}
  52. {$endif}
  53. {$endif}
  54. { helper functions from libc }
  55. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  56. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  57. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  58. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  59. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  60. {$ifndef beos}
  61. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  62. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  63. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  64. {$else beos}
  65. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  66. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  67. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  68. {$endif beos}
  69. const
  70. {$if defined(linux)}
  71. __LC_CTYPE = 0;
  72. LC_ALL = 6;
  73. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  74. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  75. CODESET = _NL_CTYPE_CODESET_NAME;
  76. {$elseif defined(darwin)}
  77. CODESET = 0;
  78. LC_ALL = 0;
  79. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  80. __LC_CTYPE = 0;
  81. LC_ALL = 0;
  82. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  83. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  84. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  85. {$elseif defined(solaris)}
  86. CODESET=49;
  87. LC_ALL = 6;
  88. {$elseif defined(beos)}
  89. {$warning check correct value for BeOS}
  90. CODESET=49;
  91. {$ifdef haiku}
  92. LC_ALL = 0; // Checked for Haiku
  93. {$else}
  94. LC_ALL = 6; // Checked for BeOS
  95. {$endif}
  96. ESysEILSEQ = EILSEQ;
  97. {$elseif defined(OpenBSD)}
  98. CODESET = 51;
  99. LC_ALL = 0;
  100. {$elseif defined(NetBSD)}
  101. CODESET = 51;
  102. LC_ALL = 0;
  103. {$elseif defined(aix)}
  104. CODESET = 49;
  105. LC_ALL = -1;
  106. {$else not aix}
  107. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  108. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  109. {$endif}
  110. { unicode encoding name }
  111. {$ifdef FPC_LITTLE_ENDIAN}
  112. unicode_encoding2 = 'UTF-16LE';
  113. unicode_encoding4 = 'UCS-4LE';
  114. {$else FPC_LITTLE_ENDIAN}
  115. unicode_encoding2 = 'UTF-16BE';
  116. unicode_encoding4 = 'UCS-4BE';
  117. {$endif FPC_LITTLE_ENDIAN}
  118. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  119. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  120. { that's a libc macro mapped to internal functions/variables }
  121. { and thus not a stable external API on systems where libc }
  122. { breaks backwards compatibility every now and then }
  123. MB_CUR_MAX = 10;
  124. { Requests for iconvctl }
  125. ICONV_TRIVIALP = 0; // int *argument
  126. ICONV_GET_TRANSLITERATE = 1; // int *argument
  127. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  128. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  129. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  130. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  131. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  132. type
  133. piconv_t = ^iconv_t;
  134. iconv_t = pointer;
  135. nl_item = cint;
  136. {$ifdef haiku}
  137. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  138. {$else}
  139. {$ifndef beos}
  140. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  141. {$endif}
  142. {$endif}
  143. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  144. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  145. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  146. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  147. const
  148. iconvctlname='iconvctl';
  149. {$else}
  150. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  151. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  152. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  153. const
  154. iconvctlname='libiconvctl';
  155. {$endif}
  156. var
  157. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  158. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  159. threadvar
  160. iconv_ansi2wide,
  161. iconv_wide2ansi : iconv_t;
  162. { since we cache the iconv_t converters, we have to do the same
  163. for the DefaultSystemCodePage variable since if it changes, we
  164. have to re-initialize the converters too. We can't do that via
  165. a callback in the widestring manager because DefaultSystemCodePage
  166. is not a threadvar and we can't automatically change this in all
  167. threads }
  168. current_DefaultSystemCodePage: TSystemCodePage;
  169. function win2iconv(cp: word): rawbytestring; forward;
  170. procedure InitThread;
  171. var
  172. transliterate: cint;
  173. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  174. iconvname: rawbytestring;
  175. {$endif}
  176. begin
  177. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  178. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  179. iconvname:=win2iconv(DefaultSystemCodePage);
  180. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  181. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  182. {$else}
  183. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  184. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  185. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  186. {$endif}
  187. if assigned(iconvctl) and
  188. (iconv_wide2ansi<>iconv_t(-1)) then
  189. begin
  190. transliterate:=1;
  191. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  192. end;
  193. end;
  194. procedure FiniThread;
  195. begin
  196. if (iconv_wide2ansi <> iconv_t(-1)) then
  197. iconv_close(iconv_wide2ansi);
  198. if (iconv_ansi2wide <> iconv_t(-1)) then
  199. iconv_close(iconv_ansi2wide);
  200. end;
  201. {$i winiconv.inc}
  202. {$if defined(beos) and not defined(haiku)}
  203. function nl_langinfo(__item:nl_item):pchar;
  204. begin
  205. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  206. // Now implement the minimum required to correctly initialize WideString support
  207. case __item of
  208. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  209. else
  210. begin
  211. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  212. Result := '';
  213. end;
  214. end;
  215. end;
  216. {$endif}
  217. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  218. var
  219. outlength,
  220. outoffset,
  221. srclen,
  222. outleft : size_t;
  223. use_iconv: iconv_t;
  224. srcpos : pwidechar;
  225. destpos: pchar;
  226. mynil : pchar;
  227. my0 : size_t;
  228. err,
  229. transliterate: cint;
  230. free_iconv: boolean;
  231. begin
  232. if (cp=DefaultSystemCodePage) then
  233. begin
  234. { update iconv converter in case the DefaultSystemCodePage has been
  235. changed }
  236. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  237. begin
  238. FiniThread;
  239. InitThread;
  240. end;
  241. use_iconv:=iconv_wide2ansi;
  242. free_iconv:=false;
  243. end
  244. else
  245. begin
  246. { TODO: add caching (then we also don't need separate code for
  247. the default system page and other ones)
  248. -- typecasting an ansistring function result to pchar is
  249. unsafe normally, but these are constant strings -> no
  250. problem }
  251. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  252. if assigned(iconvctl) then
  253. begin
  254. transliterate:=1;
  255. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  256. end;
  257. free_iconv:=true;
  258. end;
  259. { unsupported encoding -> default move }
  260. if use_iconv=iconv_t(-1) then
  261. begin
  262. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  263. exit;
  264. end;
  265. mynil:=nil;
  266. my0:=0;
  267. { rought estimation }
  268. setlength(dest,len*3);
  269. outlength:=len*3;
  270. srclen:=len*2;
  271. srcpos:=source;
  272. destpos:=pchar(dest);
  273. outleft:=outlength;
  274. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  275. begin
  276. err:=fpgetCerrno;
  277. case err of
  278. { last character is incomplete sequence }
  279. ESysEINVAL,
  280. { incomplete sequence in the middle }
  281. ESysEILSEQ:
  282. begin
  283. { skip and set to '?' }
  284. inc(srcpos);
  285. dec(srclen,2);
  286. destpos^:='?';
  287. inc(destpos);
  288. dec(outleft);
  289. { reset }
  290. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  291. if err=ESysEINVAL then
  292. break;
  293. end;
  294. ESysE2BIG:
  295. begin
  296. outoffset:=destpos-pchar(dest);
  297. { extend }
  298. setlength(dest,outlength+len*3);
  299. inc(outleft,len*3);
  300. inc(outlength,len*3);
  301. { string could have been moved }
  302. destpos:=pchar(dest)+outoffset;
  303. end;
  304. else
  305. runerror(231);
  306. end;
  307. end;
  308. // truncate string
  309. setlength(dest,length(dest)-outleft);
  310. SetCodePage(dest,cp,false);
  311. if free_iconv then
  312. iconv_close(use_iconv);
  313. end;
  314. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  315. var
  316. outlength,
  317. outoffset,
  318. outleft : size_t;
  319. use_iconv: iconv_t;
  320. srcpos,
  321. destpos: pchar;
  322. mynil : pchar;
  323. my0 : size_t;
  324. err: cint;
  325. free_iconv: boolean;
  326. begin
  327. if (cp=DefaultSystemCodePage) then
  328. begin
  329. { update iconv converter in case the DefaultSystemCodePage has been
  330. changed }
  331. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  332. begin
  333. FiniThread;
  334. InitThread;
  335. end;
  336. use_iconv:=iconv_ansi2wide;
  337. free_iconv:=false;
  338. end
  339. else
  340. begin
  341. { TODO: add caching (then we also don't need separate code for
  342. the default system page and other ones)
  343. -- typecasting an ansistring function result to pchar is
  344. unsafe normally, but these are constant strings -> no
  345. problem }
  346. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  347. free_iconv:=true;
  348. end;
  349. { unsupported encoding -> default move }
  350. if use_iconv=iconv_t(-1) then
  351. begin
  352. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  353. exit;
  354. end;
  355. mynil:=nil;
  356. my0:=0;
  357. // extra space
  358. outlength:=len+1;
  359. setlength(dest,outlength);
  360. srcpos:=source;
  361. destpos:=pchar(dest);
  362. outleft:=outlength*2;
  363. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  364. begin
  365. err:=fpgetCerrno;
  366. case err of
  367. ESysEINVAL,
  368. ESysEILSEQ:
  369. begin
  370. { skip and set to '?' }
  371. inc(srcpos);
  372. dec(len);
  373. pwidechar(destpos)^:='?';
  374. inc(destpos,2);
  375. dec(outleft,2);
  376. { reset }
  377. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  378. if err=ESysEINVAL then
  379. break;
  380. end;
  381. ESysE2BIG:
  382. begin
  383. outoffset:=destpos-pchar(dest);
  384. { extend }
  385. setlength(dest,outlength+len);
  386. inc(outleft,len*2);
  387. inc(outlength,len);
  388. { string could have been moved }
  389. destpos:=pchar(dest)+outoffset;
  390. end;
  391. else
  392. runerror(231);
  393. end;
  394. end;
  395. // truncate string
  396. setlength(dest,length(dest)-outleft div 2);
  397. if free_iconv then
  398. iconv_close(use_iconv);
  399. end;
  400. function LowerWideString(const s : WideString) : WideString;
  401. var
  402. i : SizeInt;
  403. begin
  404. SetLength(result,length(s));
  405. for i:=0 to length(s)-1 do
  406. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  407. end;
  408. function UpperWideString(const s : WideString) : WideString;
  409. var
  410. i : SizeInt;
  411. begin
  412. SetLength(result,length(s));
  413. for i:=0 to length(s)-1 do
  414. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  415. end;
  416. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  417. begin
  418. if (len>length(s)) then
  419. if (length(s) < 10*256) then
  420. setlength(s,length(s)+10)
  421. else
  422. setlength(s,length(s)+length(s) shr 8);
  423. end;
  424. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  425. begin
  426. EnsureAnsiLen(s,index);
  427. pchar(@s[index])^:=c;
  428. inc(index);
  429. end;
  430. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  431. {$ifndef beos}
  432. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  433. {$else not beos}
  434. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  435. {$endif beos}
  436. var
  437. p : pchar;
  438. mblen : size_t;
  439. begin
  440. { we know that s is unique -> avoid uniquestring calls}
  441. p:=@s[index];
  442. if (nc<=127) then
  443. ConcatCharToAnsiStr(char(nc),s,index)
  444. else
  445. begin
  446. EnsureAnsiLen(s,index+MB_CUR_MAX);
  447. {$ifndef beos}
  448. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  449. {$else not beos}
  450. mblen:=wctomb(p,wchar_t(nc));
  451. {$endif not beos}
  452. if (mblen<>size_t(-1)) then
  453. inc(index,mblen)
  454. else
  455. begin
  456. { invalid wide char }
  457. p^:='?';
  458. inc(index);
  459. end;
  460. end;
  461. end;
  462. function LowerAnsiString(const s : AnsiString) : AnsiString;
  463. var
  464. i, slen,
  465. resindex : SizeInt;
  466. mblen : size_t;
  467. {$ifndef beos}
  468. ombstate,
  469. nmbstate : mbstate_t;
  470. {$endif beos}
  471. wc : wchar_t;
  472. begin
  473. {$ifndef beos}
  474. fillchar(ombstate,sizeof(ombstate),0);
  475. fillchar(nmbstate,sizeof(nmbstate),0);
  476. {$endif beos}
  477. slen:=length(s);
  478. SetLength(result,slen+10);
  479. i:=1;
  480. resindex:=1;
  481. while (i<=slen) do
  482. begin
  483. if (s[i]<=#127) then
  484. begin
  485. wc:=wchar_t(s[i]);
  486. mblen:= 1;
  487. end
  488. else
  489. {$ifndef beos}
  490. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  491. {$else not beos}
  492. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  493. {$endif not beos}
  494. case mblen of
  495. size_t(-2):
  496. begin
  497. { partial invalid character, copy literally }
  498. while (i<=slen) do
  499. begin
  500. ConcatCharToAnsiStr(s[i],result,resindex);
  501. inc(i);
  502. end;
  503. end;
  504. size_t(-1), 0:
  505. begin
  506. { invalid or null character }
  507. ConcatCharToAnsiStr(s[i],result,resindex);
  508. inc(i);
  509. end;
  510. else
  511. begin
  512. { a valid sequence }
  513. { even if mblen = 1, the lowercase version may have a }
  514. { different length }
  515. { We can't do anything special if wchar_t is 16 bit... }
  516. {$ifndef beos}
  517. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  518. {$else not beos}
  519. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  520. {$endif not beos}
  521. inc(i,mblen);
  522. end;
  523. end;
  524. end;
  525. SetLength(result,resindex-1);
  526. end;
  527. function UpperAnsiString(const s : AnsiString) : AnsiString;
  528. var
  529. i, slen,
  530. resindex : SizeInt;
  531. mblen : size_t;
  532. {$ifndef beos}
  533. ombstate,
  534. nmbstate : mbstate_t;
  535. {$endif beos}
  536. wc : wchar_t;
  537. begin
  538. {$ifndef beos}
  539. fillchar(ombstate,sizeof(ombstate),0);
  540. fillchar(nmbstate,sizeof(nmbstate),0);
  541. {$endif beos}
  542. slen:=length(s);
  543. SetLength(result,slen+10);
  544. i:=1;
  545. resindex:=1;
  546. while (i<=slen) do
  547. begin
  548. if (s[i]<=#127) then
  549. begin
  550. wc:=wchar_t(s[i]);
  551. mblen:= 1;
  552. end
  553. else
  554. {$ifndef beos}
  555. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  556. {$else not beos}
  557. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  558. {$endif beos}
  559. case mblen of
  560. size_t(-2):
  561. begin
  562. { partial invalid character, copy literally }
  563. while (i<=slen) do
  564. begin
  565. ConcatCharToAnsiStr(s[i],result,resindex);
  566. inc(i);
  567. end;
  568. end;
  569. size_t(-1), 0:
  570. begin
  571. { invalid or null character }
  572. ConcatCharToAnsiStr(s[i],result,resindex);
  573. inc(i);
  574. end;
  575. else
  576. begin
  577. { a valid sequence }
  578. { even if mblen = 1, the uppercase version may have a }
  579. { different length }
  580. { We can't do anything special if wchar_t is 16 bit... }
  581. {$ifndef beos}
  582. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  583. {$else not beos}
  584. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  585. {$endif not beos}
  586. inc(i,mblen);
  587. end;
  588. end;
  589. end;
  590. SetLength(result,resindex-1);
  591. end;
  592. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  593. var
  594. i, slen,
  595. destindex : SizeInt;
  596. uch : UCS4Char;
  597. begin
  598. slen:=length(s);
  599. setlength(result,slen+1);
  600. i:=1;
  601. destindex:=0;
  602. while (i<=slen) do
  603. begin
  604. uch:=UCS4Char(s[i]);
  605. if (uch=0) then
  606. result[destindex]:=32
  607. else if (uch<=$d7ff) or (uch>=$e000) then
  608. result[destindex]:=uch
  609. else if (uch<=$dbff) and
  610. (i<slen) and
  611. (s[i+1]>=#$dc00) and
  612. (s[i+1]<=#$dfff) then
  613. begin
  614. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  615. inc(i);
  616. end
  617. else { invalid surrogate pair }
  618. result[destindex]:=uch;
  619. inc(i);
  620. inc(destindex);
  621. end;
  622. result[destindex]:=UCS4Char(0);
  623. { Trimming length in this particular case is just a waste of time,
  624. because result will be interpreted as null-terminated and discarded
  625. almost immediately }
  626. end;
  627. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  628. var
  629. hs1,hs2 : UCS4String;
  630. begin
  631. { wcscoll interprets null chars as end-of-string -> filter out }
  632. hs1:=WideStringToUCS4StringNoNulls(s1);
  633. hs2:=WideStringToUCS4StringNoNulls(s2);
  634. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  635. end;
  636. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  637. begin
  638. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  639. end;
  640. function CharLengthPChar(const Str: PChar): PtrInt;
  641. var
  642. nextlen: ptrint;
  643. s: pchar;
  644. {$ifndef beos}
  645. mbstate: mbstate_t;
  646. {$endif not beos}
  647. begin
  648. result:=0;
  649. s:=str;
  650. {$ifndef beos}
  651. fillchar(mbstate,sizeof(mbstate),0);
  652. {$endif not beos}
  653. repeat
  654. {$ifdef beos}
  655. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  656. {$else beos}
  657. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  658. {$endif beos}
  659. { skip invalid/incomplete sequences }
  660. if (nextlen<0) then
  661. nextlen:=1;
  662. inc(result,nextlen);
  663. inc(s,nextlen);
  664. until (nextlen=0);
  665. end;
  666. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  667. var
  668. nextlen: ptrint;
  669. {$ifndef beos}
  670. mbstate: mbstate_t;
  671. {$endif not beos}
  672. begin
  673. {$ifdef beos}
  674. result:=ptrint(mblen(str,maxlookahead));
  675. {$else beos}
  676. fillchar(mbstate,sizeof(mbstate),0);
  677. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  678. { mbrlen can also return -2 for "incomplete but potially valid character
  679. and data has been processed" }
  680. if result<0 then
  681. result:=-1;
  682. {$endif beos}
  683. end;
  684. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  685. var
  686. a,b: pchar;
  687. i: PtrInt;
  688. begin
  689. if not(canmodifys1) then
  690. getmem(a,len1+1)
  691. else
  692. a:=s1;
  693. for i:=0 to len1-1 do
  694. if s1[i]<>#0 then
  695. a[i]:=s1[i]
  696. else
  697. a[i]:=#32;
  698. a[len1]:=#0;
  699. if not(canmodifys2) then
  700. getmem(b,len2+1)
  701. else
  702. b:=s2;
  703. for i:=0 to len2-1 do
  704. if s2[i]<>#0 then
  705. b[i]:=s2[i]
  706. else
  707. b[i]:=#32;
  708. b[len2]:=#0;
  709. result:=strcoll(a,b);
  710. if not(canmodifys1) then
  711. freemem(a);
  712. if not(canmodifys2) then
  713. freemem(b);
  714. end;
  715. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  716. begin
  717. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  718. end;
  719. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  720. begin
  721. result:=strcoll(s1,s2);
  722. end;
  723. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  724. var
  725. a, b: AnsiString;
  726. begin
  727. a:=UpperAnsistring(s1);
  728. b:=UpperAnsistring(s2);
  729. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  730. end;
  731. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  732. begin
  733. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  734. end;
  735. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  736. var
  737. a, b: pchar;
  738. begin
  739. if (maxlen=0) then
  740. exit(0);
  741. if (s1[maxlen]<>#0) then
  742. begin
  743. getmem(a,maxlen+1);
  744. move(s1^,a^,maxlen);
  745. a[maxlen]:=#0;
  746. end
  747. else
  748. a:=s1;
  749. if (s2[maxlen]<>#0) then
  750. begin
  751. getmem(b,maxlen+1);
  752. move(s2^,b^,maxlen);
  753. b[maxlen]:=#0;
  754. end
  755. else
  756. b:=s2;
  757. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  758. if (a<>s1) then
  759. freemem(a);
  760. if (b<>s2) then
  761. freemem(b);
  762. end;
  763. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  764. var
  765. a, b: ansistring;
  766. begin
  767. if (maxlen=0) then
  768. exit(0);
  769. setlength(a,maxlen);
  770. move(s1^,a[1],maxlen);
  771. setlength(b,maxlen);
  772. move(s2^,b[1],maxlen);
  773. result:=AnsiCompareText(a,b);
  774. end;
  775. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  776. var
  777. newlen: sizeint;
  778. begin
  779. newlen:=length(s);
  780. if newlen>strlen(orgp) then
  781. fpc_rangeerror;
  782. p:=orgp;
  783. if (newlen>0) then
  784. move(s[1],p[0],newlen);
  785. p[newlen]:=#0;
  786. end;
  787. function AnsiStrLower(Str: PChar): PChar;
  788. var
  789. temp: ansistring;
  790. begin
  791. temp:=loweransistring(str);
  792. ansi2pchar(temp,str,result);
  793. end;
  794. function AnsiStrUpper(Str: PChar): PChar;
  795. var
  796. temp: ansistring;
  797. begin
  798. temp:=upperansistring(str);
  799. ansi2pchar(temp,str,result);
  800. end;
  801. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  802. var
  803. langinfo: pchar;
  804. begin
  805. langinfo:=nl_langinfo(CODESET);
  806. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  807. that causes it to return an empty string of UTF-8 locales
  808. -> patch up (and in general, UTF-8 is a good default on
  809. Unix platforms) }
  810. if not assigned(langinfo) or
  811. (langinfo^=#0) then
  812. langinfo:='UTF-8';
  813. Result := iconv2win(ansistring(langinfo));
  814. end;
  815. {$ifdef FPC_HAS_CPSTRING}
  816. {$i textrec.inc}
  817. procedure SetStdIOCodePage(var T: Text); inline;
  818. begin
  819. case TextRec(T).Mode of
  820. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  821. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  822. end;
  823. end;
  824. procedure SetStdIOCodePages; inline;
  825. begin
  826. SetStdIOCodePage(Input);
  827. SetStdIOCodePage(Output);
  828. SetStdIOCodePage(ErrOutput);
  829. SetStdIOCodePage(StdOut);
  830. SetStdIOCodePage(StdErr);
  831. end;
  832. {$endif FPC_HAS_CPSTRING}
  833. Procedure SetCWideStringManager;
  834. Var
  835. CWideStringManager : TUnicodeStringManager;
  836. begin
  837. CWideStringManager:=widestringmanager;
  838. With CWideStringManager do
  839. begin
  840. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  841. Ansi2WideMoveProc:=@Ansi2WideMove;
  842. UpperWideStringProc:=@UpperWideString;
  843. LowerWideStringProc:=@LowerWideString;
  844. CompareWideStringProc:=@CompareWideString;
  845. CompareTextWideStringProc:=@CompareTextWideString;
  846. CharLengthPCharProc:=@CharLengthPChar;
  847. CodePointLengthProc:=@CodePointLength;
  848. UpperAnsiStringProc:=@UpperAnsiString;
  849. LowerAnsiStringProc:=@LowerAnsiString;
  850. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  851. CompareTextAnsiStringProc:=@AnsiCompareText;
  852. StrCompAnsiStringProc:=@StrCompAnsi;
  853. StrICompAnsiStringProc:=@AnsiStrIComp;
  854. StrLCompAnsiStringProc:=@AnsiStrLComp;
  855. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  856. StrLowerAnsiStringProc:=@AnsiStrLower;
  857. StrUpperAnsiStringProc:=@AnsiStrUpper;
  858. ThreadInitProc:=@InitThread;
  859. ThreadFiniProc:=@FiniThread;
  860. { Unicode }
  861. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  862. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  863. UpperUnicodeStringProc:=@UpperWideString;
  864. LowerUnicodeStringProc:=@LowerWideString;
  865. CompareUnicodeStringProc:=@CompareWideString;
  866. CompareTextUnicodeStringProc:=@CompareTextWideString;
  867. { CodePage }
  868. GetStandardCodePageProc:=@GetStandardCodePage;
  869. end;
  870. SetUnicodeStringManager(CWideStringManager);
  871. end;
  872. var
  873. iconvlib:TLibHandle;
  874. initialization
  875. SetCWideStringManager;
  876. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  877. { with the information from the environment variables according to POSIX }
  878. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  879. setlocale(LC_ALL,'');
  880. { load iconvctl function }
  881. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  882. if iconvlib<>0 then
  883. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  884. { set the DefaultSystemCodePage }
  885. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  886. {$ifdef FPC_HAS_CPSTRING}
  887. SetStdIOCodePages;
  888. {$endif FPC_HAS_CPSTRING}
  889. { init conversion tables for main program }
  890. InitThread;
  891. finalization
  892. { fini conversion tables for main program }
  893. FiniThread;
  894. { unload iconv library }
  895. if iconvlib<>0 then
  896. FreeLibrary(iconvlib);
  897. end.