cwstring.pp 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$if defined(haiku)}
  21. {$linklib textencoding}
  22. {$linklib locale}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc,
  34. dynlibs;
  35. Const
  36. {$ifndef useiconv}
  37. libiconvname='c'; // is in libc under Linux.
  38. {$else}
  39. {$ifdef haiku}
  40. libiconvname='textencoding'; // is in libtextencoding under Haiku
  41. {$else}
  42. {$ifdef darwin}
  43. libiconvname='libiconv';
  44. {$else}
  45. libiconvname='iconv';
  46. {$endif}
  47. {$endif}
  48. {$endif}
  49. { helper functions from libc }
  50. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  51. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  52. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  53. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  54. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  55. {$ifndef beos}
  56. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  57. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  58. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  59. {$else beos}
  60. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  61. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  62. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  63. {$endif beos}
  64. const
  65. {$ifdef linux}
  66. __LC_CTYPE = 0;
  67. LC_ALL = 6;
  68. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  69. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  70. CODESET = _NL_CTYPE_CODESET_NAME;
  71. {$else linux}
  72. {$ifdef darwin}
  73. CODESET = 0;
  74. LC_ALL = 0;
  75. {$else darwin}
  76. {$ifdef FreeBSD} // actually FreeBSD5. internationalisation is afaik not default on 4.
  77. __LC_CTYPE = 0;
  78. LC_ALL = 0;
  79. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  80. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  81. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  82. {$else freebsd}
  83. {$ifdef solaris}
  84. CODESET=49;
  85. LC_ALL = 6;
  86. {$else solaris}
  87. {$ifdef beos}
  88. {$warning check correct value for BeOS}
  89. CODESET=49;
  90. {$ifdef haiku}
  91. LC_ALL = 0; // Checked for Haiku
  92. {$else}
  93. LC_ALL = 6; // Checked for BeOS
  94. {$endif}
  95. ESysEILSEQ = EILSEQ;
  96. {$else}
  97. {$ifdef OpenBSD}
  98. CODESET = 51;
  99. LC_ALL = 0;
  100. {$else not OpenBSD}
  101. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  102. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  103. {$endif OpenBSD}
  104. {$endif beos}
  105. {$endif solaris}
  106. {$endif FreeBSD}
  107. {$endif darwin}
  108. {$endif linux}
  109. { unicode encoding name }
  110. {$ifdef FPC_LITTLE_ENDIAN}
  111. unicode_encoding2 = 'UTF-16LE';
  112. unicode_encoding4 = 'UCS-4LE';
  113. {$else FPC_LITTLE_ENDIAN}
  114. unicode_encoding2 = 'UTF-16BE';
  115. unicode_encoding4 = 'UCS-4BE';
  116. {$endif FPC_LITTLE_ENDIAN}
  117. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  118. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  119. { that's a libc macro mapped to internal functions/variables }
  120. { and thus not a stable external API on systems where libc }
  121. { breaks backwards compatibility every now and then }
  122. MB_CUR_MAX = 10;
  123. { Requests for iconvctl }
  124. ICONV_TRIVIALP = 0; // int *argument
  125. ICONV_GET_TRANSLITERATE = 1; // int *argument
  126. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  127. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  128. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  129. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  130. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  131. type
  132. piconv_t = ^iconv_t;
  133. iconv_t = pointer;
  134. nl_item = cint;
  135. {$ifdef haiku}
  136. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  137. {$else}
  138. {$ifndef beos}
  139. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  140. {$endif}
  141. {$endif}
  142. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  143. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  144. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  145. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  146. const
  147. iconvctlname='iconvctl';
  148. {$else}
  149. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  150. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  151. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  152. const
  153. iconvctlname='libiconvctl';
  154. {$endif}
  155. var
  156. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  157. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  158. threadvar
  159. iconv_ansi2wide,
  160. iconv_wide2ansi : iconv_t;
  161. { since we cache the iconv_t converters, we have to do the same
  162. for the DefaultSystemCodePage variable since if it changes, we
  163. have to re-initialize the converters too. We can't do that via
  164. a callback in the widestring manager because DefaultSystemCodePage
  165. is not a threadvar and we can't automatically change this in all
  166. threads }
  167. current_DefaultSystemCodePage: TSystemCodePage;
  168. function win2iconv(cp: word): rawbytestring; forward;
  169. procedure InitThread;
  170. var
  171. transliterate: cint;
  172. {$if not(defined(darwin) and defined(cpuarm))}
  173. iconvname: rawbytestring;
  174. {$endif}
  175. begin
  176. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  177. {$if not(defined(darwin) and defined(cpuarm))}
  178. iconvname:=win2iconv(DefaultSystemCodePage);
  179. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  180. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  181. {$else}
  182. { Unix locale settings are ignored on iPhoneOS }
  183. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  184. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  185. {$endif}
  186. if assigned(iconvctl) then
  187. begin
  188. transliterate:=1;
  189. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  190. end;
  191. end;
  192. procedure FiniThread;
  193. begin
  194. if (iconv_wide2ansi <> iconv_t(-1)) then
  195. iconv_close(iconv_wide2ansi);
  196. if (iconv_ansi2wide <> iconv_t(-1)) then
  197. iconv_close(iconv_ansi2wide);
  198. end;
  199. {$i winiconv.inc}
  200. {$if defined(beos) and not defined(haiku)}
  201. function nl_langinfo(__item:nl_item):pchar;
  202. begin
  203. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  204. // Now implement the minimum required to correctly initialize WideString support
  205. case __item of
  206. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  207. else
  208. begin
  209. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  210. Result := '';
  211. end;
  212. end;
  213. end;
  214. {$endif}
  215. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  216. var
  217. outlength,
  218. outoffset,
  219. srclen,
  220. outleft : size_t;
  221. use_iconv: iconv_t;
  222. srcpos : pwidechar;
  223. destpos: pchar;
  224. mynil : pchar;
  225. my0 : size_t;
  226. err,
  227. transliterate: cint;
  228. free_iconv: boolean;
  229. begin
  230. if (cp=DefaultSystemCodePage) then
  231. begin
  232. { update iconv converter in case the DefaultSystemCodePage has been
  233. changed }
  234. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  235. begin
  236. FiniThread;
  237. InitThread;
  238. end;
  239. use_iconv:=iconv_wide2ansi;
  240. free_iconv:=false;
  241. end
  242. else
  243. begin
  244. { TODO: add caching (then we also don't need separate code for
  245. the default system page and other ones)
  246. -- typecasting an ansistring function result to pchar is
  247. unsafe normally, but these are constant strings -> no
  248. problem }
  249. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  250. if assigned(iconvctl) then
  251. begin
  252. transliterate:=1;
  253. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  254. end;
  255. free_iconv:=true;
  256. end;
  257. { unsupported encoding -> default move }
  258. if use_iconv=iconv_t(-1) then
  259. begin
  260. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  261. exit;
  262. end;
  263. mynil:=nil;
  264. my0:=0;
  265. { rought estimation }
  266. setlength(dest,len*3);
  267. SetCodePage(dest,cp,false);
  268. outlength:=len*3;
  269. srclen:=len*2;
  270. srcpos:=source;
  271. destpos:=pchar(dest);
  272. outleft:=outlength;
  273. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  274. begin
  275. err:=fpgetCerrno;
  276. case err of
  277. { last character is incomplete sequence }
  278. ESysEINVAL,
  279. { incomplete sequence in the middle }
  280. ESysEILSEQ:
  281. begin
  282. { skip and set to '?' }
  283. inc(srcpos);
  284. dec(srclen,2);
  285. destpos^:='?';
  286. inc(destpos);
  287. dec(outleft);
  288. { reset }
  289. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  290. if err=ESysEINVAL then
  291. break;
  292. end;
  293. ESysE2BIG:
  294. begin
  295. outoffset:=destpos-pchar(dest);
  296. { extend }
  297. setlength(dest,outlength+len*3);
  298. inc(outleft,len*3);
  299. inc(outlength,len*3);
  300. { string could have been moved }
  301. destpos:=pchar(dest)+outoffset;
  302. end;
  303. else
  304. runerror(231);
  305. end;
  306. end;
  307. // truncate string
  308. setlength(dest,length(dest)-outleft);
  309. if free_iconv then
  310. iconv_close(use_iconv);
  311. end;
  312. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  313. var
  314. outlength,
  315. outoffset,
  316. outleft : size_t;
  317. use_iconv: iconv_t;
  318. srcpos,
  319. destpos: pchar;
  320. mynil : pchar;
  321. my0 : size_t;
  322. err: cint;
  323. free_iconv: boolean;
  324. begin
  325. if (cp=DefaultSystemCodePage) then
  326. begin
  327. { update iconv converter in case the DefaultSystemCodePage has been
  328. changed }
  329. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  330. begin
  331. FiniThread;
  332. InitThread;
  333. end;
  334. use_iconv:=iconv_ansi2wide;
  335. free_iconv:=false;
  336. end
  337. else
  338. begin
  339. { TODO: add caching (then we also don't need separate code for
  340. the default system page and other ones)
  341. -- typecasting an ansistring function result to pchar is
  342. unsafe normally, but these are constant strings -> no
  343. problem }
  344. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  345. free_iconv:=true;
  346. end;
  347. { unsupported encoding -> default move }
  348. if use_iconv=iconv_t(-1) then
  349. begin
  350. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  351. exit;
  352. end;
  353. mynil:=nil;
  354. my0:=0;
  355. // extra space
  356. outlength:=len+1;
  357. setlength(dest,outlength);
  358. srcpos:=source;
  359. destpos:=pchar(dest);
  360. outleft:=outlength*2;
  361. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  362. begin
  363. err:=fpgetCerrno;
  364. case err of
  365. ESysEINVAL,
  366. ESysEILSEQ:
  367. begin
  368. { skip and set to '?' }
  369. inc(srcpos);
  370. dec(len);
  371. pwidechar(destpos)^:='?';
  372. inc(destpos,2);
  373. dec(outleft,2);
  374. { reset }
  375. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  376. if err=ESysEINVAL then
  377. break;
  378. end;
  379. ESysE2BIG:
  380. begin
  381. outoffset:=destpos-pchar(dest);
  382. { extend }
  383. setlength(dest,outlength+len);
  384. inc(outleft,len*2);
  385. inc(outlength,len);
  386. { string could have been moved }
  387. destpos:=pchar(dest)+outoffset;
  388. end;
  389. else
  390. runerror(231);
  391. end;
  392. end;
  393. // truncate string
  394. setlength(dest,length(dest)-outleft div 2);
  395. if free_iconv then
  396. iconv_close(use_iconv);
  397. end;
  398. function LowerWideString(const s : WideString) : WideString;
  399. var
  400. i : SizeInt;
  401. begin
  402. SetLength(result,length(s));
  403. for i:=0 to length(s)-1 do
  404. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  405. end;
  406. function UpperWideString(const s : WideString) : WideString;
  407. var
  408. i : SizeInt;
  409. begin
  410. SetLength(result,length(s));
  411. for i:=0 to length(s)-1 do
  412. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  413. end;
  414. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  415. begin
  416. if (len>length(s)) then
  417. if (length(s) < 10*256) then
  418. setlength(s,length(s)+10)
  419. else
  420. setlength(s,length(s)+length(s) shr 8);
  421. end;
  422. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  423. begin
  424. EnsureAnsiLen(s,index);
  425. pchar(@s[index])^:=c;
  426. inc(index);
  427. end;
  428. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  429. {$ifndef beos}
  430. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  431. {$else not beos}
  432. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  433. {$endif beos}
  434. var
  435. p : pchar;
  436. mblen : size_t;
  437. begin
  438. { we know that s is unique -> avoid uniquestring calls}
  439. p:=@s[index];
  440. if (nc<=127) then
  441. ConcatCharToAnsiStr(char(nc),s,index)
  442. else
  443. begin
  444. EnsureAnsiLen(s,index+MB_CUR_MAX);
  445. {$ifndef beos}
  446. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  447. {$else not beos}
  448. mblen:=wctomb(p,wchar_t(nc));
  449. {$endif not beos}
  450. if (mblen<>size_t(-1)) then
  451. inc(index,mblen)
  452. else
  453. begin
  454. { invalid wide char }
  455. p^:='?';
  456. inc(index);
  457. end;
  458. end;
  459. end;
  460. function LowerAnsiString(const s : AnsiString) : AnsiString;
  461. var
  462. i, slen,
  463. resindex : SizeInt;
  464. mblen : size_t;
  465. {$ifndef beos}
  466. ombstate,
  467. nmbstate : mbstate_t;
  468. {$endif beos}
  469. wc : wchar_t;
  470. begin
  471. {$ifndef beos}
  472. fillchar(ombstate,sizeof(ombstate),0);
  473. fillchar(nmbstate,sizeof(nmbstate),0);
  474. {$endif beos}
  475. slen:=length(s);
  476. SetLength(result,slen+10);
  477. i:=1;
  478. resindex:=1;
  479. while (i<=slen) do
  480. begin
  481. if (s[i]<=#127) then
  482. begin
  483. wc:=wchar_t(s[i]);
  484. mblen:= 1;
  485. end
  486. else
  487. {$ifndef beos}
  488. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  489. {$else not beos}
  490. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  491. {$endif not beos}
  492. case mblen of
  493. size_t(-2):
  494. begin
  495. { partial invalid character, copy literally }
  496. while (i<=slen) do
  497. begin
  498. ConcatCharToAnsiStr(s[i],result,resindex);
  499. inc(i);
  500. end;
  501. end;
  502. size_t(-1), 0:
  503. begin
  504. { invalid or null character }
  505. ConcatCharToAnsiStr(s[i],result,resindex);
  506. inc(i);
  507. end;
  508. else
  509. begin
  510. { a valid sequence }
  511. { even if mblen = 1, the lowercase version may have a }
  512. { different length }
  513. { We can't do anything special if wchar_t is 16 bit... }
  514. {$ifndef beos}
  515. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  516. {$else not beos}
  517. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  518. {$endif not beos}
  519. inc(i,mblen);
  520. end;
  521. end;
  522. end;
  523. SetLength(result,resindex-1);
  524. end;
  525. function UpperAnsiString(const s : AnsiString) : AnsiString;
  526. var
  527. i, slen,
  528. resindex : SizeInt;
  529. mblen : size_t;
  530. {$ifndef beos}
  531. ombstate,
  532. nmbstate : mbstate_t;
  533. {$endif beos}
  534. wc : wchar_t;
  535. begin
  536. {$ifndef beos}
  537. fillchar(ombstate,sizeof(ombstate),0);
  538. fillchar(nmbstate,sizeof(nmbstate),0);
  539. {$endif beos}
  540. slen:=length(s);
  541. SetLength(result,slen+10);
  542. i:=1;
  543. resindex:=1;
  544. while (i<=slen) do
  545. begin
  546. if (s[i]<=#127) then
  547. begin
  548. wc:=wchar_t(s[i]);
  549. mblen:= 1;
  550. end
  551. else
  552. {$ifndef beos}
  553. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  554. {$else not beos}
  555. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  556. {$endif beos}
  557. case mblen of
  558. size_t(-2):
  559. begin
  560. { partial invalid character, copy literally }
  561. while (i<=slen) do
  562. begin
  563. ConcatCharToAnsiStr(s[i],result,resindex);
  564. inc(i);
  565. end;
  566. end;
  567. size_t(-1), 0:
  568. begin
  569. { invalid or null character }
  570. ConcatCharToAnsiStr(s[i],result,resindex);
  571. inc(i);
  572. end;
  573. else
  574. begin
  575. { a valid sequence }
  576. { even if mblen = 1, the uppercase version may have a }
  577. { different length }
  578. { We can't do anything special if wchar_t is 16 bit... }
  579. {$ifndef beos}
  580. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  581. {$else not beos}
  582. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  583. {$endif not beos}
  584. inc(i,mblen);
  585. end;
  586. end;
  587. end;
  588. SetLength(result,resindex-1);
  589. end;
  590. function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; external name 'FPC_UTF16TOUTF32';
  591. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  592. var
  593. i, slen,
  594. destindex : SizeInt;
  595. len : longint;
  596. uch : UCS4Char;
  597. begin
  598. slen:=length(s);
  599. setlength(result,slen+1);
  600. i:=1;
  601. destindex:=0;
  602. while (i<=slen) do
  603. begin
  604. uch:=utf16toutf32(s,i,len);
  605. if (uch=UCS4Char(0)) then
  606. uch:=UCS4Char(32);
  607. result[destindex]:=uch;
  608. inc(destindex);
  609. inc(i,len);
  610. end;
  611. result[destindex]:=UCS4Char(0);
  612. { destindex <= slen }
  613. setlength(result,destindex+1);
  614. end;
  615. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  616. var
  617. hs1,hs2 : UCS4String;
  618. begin
  619. { wcscoll interprets null chars as end-of-string -> filter out }
  620. hs1:=WideStringToUCS4StringNoNulls(s1);
  621. hs2:=WideStringToUCS4StringNoNulls(s2);
  622. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  623. end;
  624. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  625. begin
  626. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  627. end;
  628. function CharLengthPChar(const Str: PChar): PtrInt;
  629. var
  630. nextlen: ptrint;
  631. s: pchar;
  632. {$ifndef beos}
  633. mbstate: mbstate_t;
  634. {$endif not beos}
  635. begin
  636. result:=0;
  637. s:=str;
  638. {$ifndef beos}
  639. fillchar(mbstate,sizeof(mbstate),0);
  640. {$endif not beos}
  641. repeat
  642. {$ifdef beos}
  643. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  644. {$else beos}
  645. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  646. {$endif beos}
  647. { skip invalid/incomplete sequences }
  648. if (nextlen<0) then
  649. nextlen:=1;
  650. inc(result,nextlen);
  651. inc(s,nextlen);
  652. until (nextlen=0);
  653. end;
  654. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  655. var
  656. nextlen: ptrint;
  657. {$ifndef beos}
  658. mbstate: mbstate_t;
  659. {$endif not beos}
  660. begin
  661. {$ifdef beos}
  662. result:=ptrint(mblen(str,maxlookahead));
  663. {$else beos}
  664. fillchar(mbstate,sizeof(mbstate),0);
  665. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  666. { mbrlen can also return -2 for "incomplete but potially valid character
  667. and data has been processed" }
  668. if result<0 then
  669. result:=-1;
  670. {$endif beos}
  671. end;
  672. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  673. var
  674. a,b: pchar;
  675. i: PtrInt;
  676. begin
  677. if not(canmodifys1) then
  678. getmem(a,len1+1)
  679. else
  680. a:=s1;
  681. for i:=0 to len1-1 do
  682. if s1[i]<>#0 then
  683. a[i]:=s1[i]
  684. else
  685. a[i]:=#32;
  686. a[len1]:=#0;
  687. if not(canmodifys2) then
  688. getmem(b,len2+1)
  689. else
  690. b:=s2;
  691. for i:=0 to len2-1 do
  692. if s2[i]<>#0 then
  693. b[i]:=s2[i]
  694. else
  695. b[i]:=#32;
  696. b[len2]:=#0;
  697. result:=strcoll(a,b);
  698. if not(canmodifys1) then
  699. freemem(a);
  700. if not(canmodifys2) then
  701. freemem(b);
  702. end;
  703. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  704. begin
  705. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  706. end;
  707. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  708. begin
  709. result:=strcoll(s1,s2);
  710. end;
  711. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  712. var
  713. a, b: AnsiString;
  714. begin
  715. a:=UpperAnsistring(s1);
  716. b:=UpperAnsistring(s2);
  717. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  718. end;
  719. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  720. begin
  721. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  722. end;
  723. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  724. var
  725. a, b: pchar;
  726. begin
  727. if (maxlen=0) then
  728. exit(0);
  729. if (s1[maxlen]<>#0) then
  730. begin
  731. getmem(a,maxlen+1);
  732. move(s1^,a^,maxlen);
  733. a[maxlen]:=#0;
  734. end
  735. else
  736. a:=s1;
  737. if (s2[maxlen]<>#0) then
  738. begin
  739. getmem(b,maxlen+1);
  740. move(s2^,b^,maxlen);
  741. b[maxlen]:=#0;
  742. end
  743. else
  744. b:=s2;
  745. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  746. if (a<>s1) then
  747. freemem(a);
  748. if (b<>s2) then
  749. freemem(b);
  750. end;
  751. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  752. var
  753. a, b: ansistring;
  754. begin
  755. if (maxlen=0) then
  756. exit(0);
  757. setlength(a,maxlen);
  758. move(s1^,a[1],maxlen);
  759. setlength(b,maxlen);
  760. move(s2^,b[1],maxlen);
  761. result:=AnsiCompareText(a,b);
  762. end;
  763. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  764. var
  765. newlen: sizeint;
  766. begin
  767. newlen:=length(s);
  768. if newlen>strlen(orgp) then
  769. fpc_rangeerror;
  770. p:=orgp;
  771. if (newlen>0) then
  772. move(s[1],p[0],newlen);
  773. p[newlen]:=#0;
  774. end;
  775. function AnsiStrLower(Str: PChar): PChar;
  776. var
  777. temp: ansistring;
  778. begin
  779. temp:=loweransistring(str);
  780. ansi2pchar(temp,str,result);
  781. end;
  782. function AnsiStrUpper(Str: PChar): PChar;
  783. var
  784. temp: ansistring;
  785. begin
  786. temp:=upperansistring(str);
  787. ansi2pchar(temp,str,result);
  788. end;
  789. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  790. begin
  791. Result := iconv2win(ansistring(nl_langinfo(CODESET)))
  792. end;
  793. {$ifdef FPC_HAS_CPSTRING}
  794. {$i textrec.inc}
  795. procedure SetStdIOCodePage(var T: Text); inline;
  796. begin
  797. case TextRec(T).Mode of
  798. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  799. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  800. end;
  801. end;
  802. procedure SetStdIOCodePages; inline;
  803. begin
  804. SetStdIOCodePage(Input);
  805. SetStdIOCodePage(Output);
  806. SetStdIOCodePage(ErrOutput);
  807. SetStdIOCodePage(StdOut);
  808. SetStdIOCodePage(StdErr);
  809. end;
  810. {$endif FPC_HAS_CPSTRING}
  811. Procedure SetCWideStringManager;
  812. Var
  813. CWideStringManager : TUnicodeStringManager;
  814. begin
  815. CWideStringManager:=widestringmanager;
  816. With CWideStringManager do
  817. begin
  818. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  819. Ansi2WideMoveProc:=@Ansi2WideMove;
  820. UpperWideStringProc:=@UpperWideString;
  821. LowerWideStringProc:=@LowerWideString;
  822. CompareWideStringProc:=@CompareWideString;
  823. CompareTextWideStringProc:=@CompareTextWideString;
  824. CharLengthPCharProc:=@CharLengthPChar;
  825. CodePointLengthProc:=@CodePointLength;
  826. UpperAnsiStringProc:=@UpperAnsiString;
  827. LowerAnsiStringProc:=@LowerAnsiString;
  828. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  829. CompareTextAnsiStringProc:=@AnsiCompareText;
  830. StrCompAnsiStringProc:=@StrCompAnsi;
  831. StrICompAnsiStringProc:=@AnsiStrIComp;
  832. StrLCompAnsiStringProc:=@AnsiStrLComp;
  833. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  834. StrLowerAnsiStringProc:=@AnsiStrLower;
  835. StrUpperAnsiStringProc:=@AnsiStrUpper;
  836. ThreadInitProc:=@InitThread;
  837. ThreadFiniProc:=@FiniThread;
  838. { Unicode }
  839. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  840. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  841. UpperUnicodeStringProc:=@UpperWideString;
  842. LowerUnicodeStringProc:=@LowerWideString;
  843. CompareUnicodeStringProc:=@CompareWideString;
  844. CompareTextUnicodeStringProc:=@CompareTextWideString;
  845. { CodePage }
  846. GetStandardCodePageProc:=@GetStandardCodePage;
  847. end;
  848. SetUnicodeStringManager(CWideStringManager);
  849. end;
  850. var
  851. iconvlib:TLibHandle;
  852. initialization
  853. SetCWideStringManager;
  854. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  855. { with the information from the environment variables according to POSIX }
  856. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  857. setlocale(LC_ALL,'');
  858. { load iconvctl function }
  859. iconvlib:=LoadLibrary(libiconvname+'.'+SharedSuffix);
  860. if iconvlib<>0 then
  861. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  862. { set the DefaultSystemCodePage }
  863. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  864. {$ifdef FPC_HAS_CPSTRING}
  865. SetStdIOCodePages;
  866. {$endif FPC_HAS_CPSTRING}
  867. { init conversion tables for main program }
  868. InitThread;
  869. finalization
  870. { fini conversion tables for main program }
  871. FiniThread;
  872. { unload iconv library }
  873. if iconvlib<>0 then
  874. FreeLibrary(iconvlib);
  875. end.