cwstring.pp 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$if defined(haiku)}
  21. {$linklib textencoding}
  22. {$linklib locale}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc,
  34. dynlibs;
  35. Const
  36. {$ifndef useiconv}
  37. libiconvname='c'; // is in libc under Linux.
  38. libprefix='lib';
  39. {$else}
  40. {$ifdef haiku}
  41. libiconvname='textencoding'; // is in libtextencoding under Haiku
  42. libprefix='lib';
  43. {$else}
  44. {$if defined(darwin) or defined(aix)}
  45. libiconvname='libiconv';
  46. libprefix='';
  47. {$else}
  48. libiconvname='iconv';
  49. libprefix='lib';
  50. {$endif}
  51. {$endif}
  52. {$endif}
  53. { helper functions from libc }
  54. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  55. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  56. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  57. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  58. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  59. {$ifndef beos}
  60. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  61. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  62. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  63. {$else beos}
  64. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  65. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  66. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  67. {$endif beos}
  68. const
  69. {$if defined(linux)}
  70. __LC_CTYPE = 0;
  71. LC_ALL = 6;
  72. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  73. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  74. CODESET = _NL_CTYPE_CODESET_NAME;
  75. {$elseif defined(darwin)}
  76. CODESET = 0;
  77. LC_ALL = 0;
  78. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  79. __LC_CTYPE = 0;
  80. LC_ALL = 0;
  81. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  82. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  83. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  84. {$elseif defined(solaris)}
  85. CODESET=49;
  86. LC_ALL = 6;
  87. {$elseif defined(beos)}
  88. {$warning check correct value for BeOS}
  89. CODESET=49;
  90. {$ifdef haiku}
  91. LC_ALL = 0; // Checked for Haiku
  92. {$else}
  93. LC_ALL = 6; // Checked for BeOS
  94. {$endif}
  95. ESysEILSEQ = EILSEQ;
  96. {$elseif defined(OpenBSD)}
  97. CODESET = 51;
  98. LC_ALL = 0;
  99. {$elseif defined(NetBSD)}
  100. CODESET = 51;
  101. LC_ALL = 0;
  102. {$elseif defined(aix)}
  103. CODESET = 49;
  104. LC_ALL = -1;
  105. {$else not aix}
  106. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  107. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  108. {$endif}
  109. { unicode encoding name }
  110. {$ifdef FPC_LITTLE_ENDIAN}
  111. unicode_encoding2 = 'UTF-16LE';
  112. unicode_encoding4 = 'UCS-4LE';
  113. {$else FPC_LITTLE_ENDIAN}
  114. unicode_encoding2 = 'UTF-16BE';
  115. unicode_encoding4 = 'UCS-4BE';
  116. {$endif FPC_LITTLE_ENDIAN}
  117. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  118. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  119. { that's a libc macro mapped to internal functions/variables }
  120. { and thus not a stable external API on systems where libc }
  121. { breaks backwards compatibility every now and then }
  122. MB_CUR_MAX = 10;
  123. { Requests for iconvctl }
  124. ICONV_TRIVIALP = 0; // int *argument
  125. ICONV_GET_TRANSLITERATE = 1; // int *argument
  126. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  127. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  128. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  129. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  130. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  131. type
  132. piconv_t = ^iconv_t;
  133. iconv_t = pointer;
  134. nl_item = cint;
  135. {$ifdef haiku}
  136. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  137. {$else}
  138. {$ifndef beos}
  139. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  140. {$endif}
  141. {$endif}
  142. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  143. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  144. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  145. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  146. const
  147. iconvctlname='iconvctl';
  148. {$else}
  149. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  150. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  151. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  152. const
  153. iconvctlname='libiconvctl';
  154. {$endif}
  155. var
  156. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  157. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  158. threadvar
  159. iconv_ansi2wide,
  160. iconv_wide2ansi : iconv_t;
  161. { since we cache the iconv_t converters, we have to do the same
  162. for the DefaultSystemCodePage variable since if it changes, we
  163. have to re-initialize the converters too. We can't do that via
  164. a callback in the widestring manager because DefaultSystemCodePage
  165. is not a threadvar and we can't automatically change this in all
  166. threads }
  167. current_DefaultSystemCodePage: TSystemCodePage;
  168. function win2iconv(cp: word): rawbytestring; forward;
  169. procedure InitThread;
  170. var
  171. transliterate: cint;
  172. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  173. iconvname: rawbytestring;
  174. {$endif}
  175. begin
  176. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  177. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  178. iconvname:=win2iconv(DefaultSystemCodePage);
  179. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  180. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  181. {$else}
  182. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  183. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  184. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  185. {$endif}
  186. if assigned(iconvctl) and
  187. (iconv_wide2ansi<>iconv_t(-1)) then
  188. begin
  189. transliterate:=1;
  190. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  191. end;
  192. end;
  193. procedure FiniThread;
  194. begin
  195. if (iconv_wide2ansi <> iconv_t(-1)) then
  196. iconv_close(iconv_wide2ansi);
  197. if (iconv_ansi2wide <> iconv_t(-1)) then
  198. iconv_close(iconv_ansi2wide);
  199. end;
  200. {$i winiconv.inc}
  201. {$if defined(beos) and not defined(haiku)}
  202. function nl_langinfo(__item:nl_item):pchar;
  203. begin
  204. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  205. // Now implement the minimum required to correctly initialize WideString support
  206. case __item of
  207. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  208. else
  209. begin
  210. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  211. Result := '';
  212. end;
  213. end;
  214. end;
  215. {$endif}
  216. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  217. var
  218. outlength,
  219. outoffset,
  220. srclen,
  221. outleft : size_t;
  222. use_iconv: iconv_t;
  223. srcpos : pwidechar;
  224. destpos: pchar;
  225. mynil : pchar;
  226. my0 : size_t;
  227. err,
  228. transliterate: cint;
  229. free_iconv: boolean;
  230. begin
  231. if (cp=DefaultSystemCodePage) then
  232. begin
  233. { update iconv converter in case the DefaultSystemCodePage has been
  234. changed }
  235. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  236. begin
  237. FiniThread;
  238. InitThread;
  239. end;
  240. use_iconv:=iconv_wide2ansi;
  241. free_iconv:=false;
  242. end
  243. else
  244. begin
  245. { TODO: add caching (then we also don't need separate code for
  246. the default system page and other ones)
  247. -- typecasting an ansistring function result to pchar is
  248. unsafe normally, but these are constant strings -> no
  249. problem }
  250. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  251. if assigned(iconvctl) then
  252. begin
  253. transliterate:=1;
  254. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  255. end;
  256. free_iconv:=true;
  257. end;
  258. { unsupported encoding -> default move }
  259. if use_iconv=iconv_t(-1) then
  260. begin
  261. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  262. exit;
  263. end;
  264. mynil:=nil;
  265. my0:=0;
  266. { rought estimation }
  267. setlength(dest,len*3);
  268. outlength:=len*3;
  269. srclen:=len*2;
  270. srcpos:=source;
  271. destpos:=pchar(dest);
  272. outleft:=outlength;
  273. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  274. begin
  275. err:=fpgetCerrno;
  276. case err of
  277. { last character is incomplete sequence }
  278. ESysEINVAL,
  279. { incomplete sequence in the middle }
  280. ESysEILSEQ:
  281. begin
  282. { skip and set to '?' }
  283. inc(srcpos);
  284. dec(srclen,2);
  285. destpos^:='?';
  286. inc(destpos);
  287. dec(outleft);
  288. { reset }
  289. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  290. if err=ESysEINVAL then
  291. break;
  292. end;
  293. ESysE2BIG:
  294. begin
  295. outoffset:=destpos-pchar(dest);
  296. { extend }
  297. setlength(dest,outlength+len*3);
  298. inc(outleft,len*3);
  299. inc(outlength,len*3);
  300. { string could have been moved }
  301. destpos:=pchar(dest)+outoffset;
  302. end;
  303. else
  304. runerror(231);
  305. end;
  306. end;
  307. // truncate string
  308. setlength(dest,length(dest)-outleft);
  309. SetCodePage(dest,cp,false);
  310. if free_iconv then
  311. iconv_close(use_iconv);
  312. end;
  313. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  314. var
  315. outlength,
  316. outoffset,
  317. outleft : size_t;
  318. use_iconv: iconv_t;
  319. srcpos,
  320. destpos: pchar;
  321. mynil : pchar;
  322. my0 : size_t;
  323. err: cint;
  324. free_iconv: boolean;
  325. begin
  326. if (cp=DefaultSystemCodePage) then
  327. begin
  328. { update iconv converter in case the DefaultSystemCodePage has been
  329. changed }
  330. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  331. begin
  332. FiniThread;
  333. InitThread;
  334. end;
  335. use_iconv:=iconv_ansi2wide;
  336. free_iconv:=false;
  337. end
  338. else
  339. begin
  340. { TODO: add caching (then we also don't need separate code for
  341. the default system page and other ones)
  342. -- typecasting an ansistring function result to pchar is
  343. unsafe normally, but these are constant strings -> no
  344. problem }
  345. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  346. free_iconv:=true;
  347. end;
  348. { unsupported encoding -> default move }
  349. if use_iconv=iconv_t(-1) then
  350. begin
  351. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  352. exit;
  353. end;
  354. mynil:=nil;
  355. my0:=0;
  356. // extra space
  357. outlength:=len+1;
  358. setlength(dest,outlength);
  359. srcpos:=source;
  360. destpos:=pchar(dest);
  361. outleft:=outlength*2;
  362. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  363. begin
  364. err:=fpgetCerrno;
  365. case err of
  366. ESysEINVAL,
  367. ESysEILSEQ:
  368. begin
  369. { skip and set to '?' }
  370. inc(srcpos);
  371. dec(len);
  372. pwidechar(destpos)^:='?';
  373. inc(destpos,2);
  374. dec(outleft,2);
  375. { reset }
  376. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  377. if err=ESysEINVAL then
  378. break;
  379. end;
  380. ESysE2BIG:
  381. begin
  382. outoffset:=destpos-pchar(dest);
  383. { extend }
  384. setlength(dest,outlength+len);
  385. inc(outleft,len*2);
  386. inc(outlength,len);
  387. { string could have been moved }
  388. destpos:=pchar(dest)+outoffset;
  389. end;
  390. else
  391. runerror(231);
  392. end;
  393. end;
  394. // truncate string
  395. setlength(dest,length(dest)-outleft div 2);
  396. if free_iconv then
  397. iconv_close(use_iconv);
  398. end;
  399. function LowerWideString(const s : WideString) : WideString;
  400. var
  401. i : SizeInt;
  402. begin
  403. SetLength(result,length(s));
  404. for i:=0 to length(s)-1 do
  405. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  406. end;
  407. function UpperWideString(const s : WideString) : WideString;
  408. var
  409. i : SizeInt;
  410. begin
  411. SetLength(result,length(s));
  412. for i:=0 to length(s)-1 do
  413. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  414. end;
  415. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  416. begin
  417. if (len>length(s)) then
  418. if (length(s) < 10*256) then
  419. setlength(s,length(s)+10)
  420. else
  421. setlength(s,length(s)+length(s) shr 8);
  422. end;
  423. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  424. begin
  425. EnsureAnsiLen(s,index);
  426. pchar(@s[index])^:=c;
  427. inc(index);
  428. end;
  429. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  430. {$ifndef beos}
  431. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  432. {$else not beos}
  433. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  434. {$endif beos}
  435. var
  436. p : pchar;
  437. mblen : size_t;
  438. begin
  439. { we know that s is unique -> avoid uniquestring calls}
  440. p:=@s[index];
  441. if (nc<=127) then
  442. ConcatCharToAnsiStr(char(nc),s,index)
  443. else
  444. begin
  445. EnsureAnsiLen(s,index+MB_CUR_MAX);
  446. {$ifndef beos}
  447. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  448. {$else not beos}
  449. mblen:=wctomb(p,wchar_t(nc));
  450. {$endif not beos}
  451. if (mblen<>size_t(-1)) then
  452. inc(index,mblen)
  453. else
  454. begin
  455. { invalid wide char }
  456. p^:='?';
  457. inc(index);
  458. end;
  459. end;
  460. end;
  461. function LowerAnsiString(const s : AnsiString) : AnsiString;
  462. var
  463. i, slen,
  464. resindex : SizeInt;
  465. mblen : size_t;
  466. {$ifndef beos}
  467. ombstate,
  468. nmbstate : mbstate_t;
  469. {$endif beos}
  470. wc : wchar_t;
  471. begin
  472. {$ifndef beos}
  473. fillchar(ombstate,sizeof(ombstate),0);
  474. fillchar(nmbstate,sizeof(nmbstate),0);
  475. {$endif beos}
  476. slen:=length(s);
  477. SetLength(result,slen+10);
  478. i:=1;
  479. resindex:=1;
  480. while (i<=slen) do
  481. begin
  482. if (s[i]<=#127) then
  483. begin
  484. wc:=wchar_t(s[i]);
  485. mblen:= 1;
  486. end
  487. else
  488. {$ifndef beos}
  489. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  490. {$else not beos}
  491. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  492. {$endif not beos}
  493. case mblen of
  494. size_t(-2):
  495. begin
  496. { partial invalid character, copy literally }
  497. while (i<=slen) do
  498. begin
  499. ConcatCharToAnsiStr(s[i],result,resindex);
  500. inc(i);
  501. end;
  502. end;
  503. size_t(-1), 0:
  504. begin
  505. { invalid or null character }
  506. ConcatCharToAnsiStr(s[i],result,resindex);
  507. inc(i);
  508. end;
  509. else
  510. begin
  511. { a valid sequence }
  512. { even if mblen = 1, the lowercase version may have a }
  513. { different length }
  514. { We can't do anything special if wchar_t is 16 bit... }
  515. {$ifndef beos}
  516. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  517. {$else not beos}
  518. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  519. {$endif not beos}
  520. inc(i,mblen);
  521. end;
  522. end;
  523. end;
  524. SetLength(result,resindex-1);
  525. end;
  526. function UpperAnsiString(const s : AnsiString) : AnsiString;
  527. var
  528. i, slen,
  529. resindex : SizeInt;
  530. mblen : size_t;
  531. {$ifndef beos}
  532. ombstate,
  533. nmbstate : mbstate_t;
  534. {$endif beos}
  535. wc : wchar_t;
  536. begin
  537. {$ifndef beos}
  538. fillchar(ombstate,sizeof(ombstate),0);
  539. fillchar(nmbstate,sizeof(nmbstate),0);
  540. {$endif beos}
  541. slen:=length(s);
  542. SetLength(result,slen+10);
  543. i:=1;
  544. resindex:=1;
  545. while (i<=slen) do
  546. begin
  547. if (s[i]<=#127) then
  548. begin
  549. wc:=wchar_t(s[i]);
  550. mblen:= 1;
  551. end
  552. else
  553. {$ifndef beos}
  554. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  555. {$else not beos}
  556. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  557. {$endif beos}
  558. case mblen of
  559. size_t(-2):
  560. begin
  561. { partial invalid character, copy literally }
  562. while (i<=slen) do
  563. begin
  564. ConcatCharToAnsiStr(s[i],result,resindex);
  565. inc(i);
  566. end;
  567. end;
  568. size_t(-1), 0:
  569. begin
  570. { invalid or null character }
  571. ConcatCharToAnsiStr(s[i],result,resindex);
  572. inc(i);
  573. end;
  574. else
  575. begin
  576. { a valid sequence }
  577. { even if mblen = 1, the uppercase version may have a }
  578. { different length }
  579. { We can't do anything special if wchar_t is 16 bit... }
  580. {$ifndef beos}
  581. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  582. {$else not beos}
  583. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  584. {$endif not beos}
  585. inc(i,mblen);
  586. end;
  587. end;
  588. end;
  589. SetLength(result,resindex-1);
  590. end;
  591. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  592. var
  593. i, slen,
  594. destindex : SizeInt;
  595. uch : UCS4Char;
  596. begin
  597. slen:=length(s);
  598. setlength(result,slen+1);
  599. i:=1;
  600. destindex:=0;
  601. while (i<=slen) do
  602. begin
  603. uch:=UCS4Char(s[i]);
  604. if (uch=0) then
  605. result[destindex]:=32
  606. else if (uch<=$d7ff) or (uch>=$e000) then
  607. result[destindex]:=uch
  608. else if (uch<=$dbff) and
  609. (i<slen) and
  610. (s[i+1]>=#$dc00) and
  611. (s[i+1]<=#$dfff) then
  612. begin
  613. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  614. inc(i);
  615. end
  616. else { invalid surrogate pair }
  617. result[destindex]:=uch;
  618. inc(i);
  619. inc(destindex);
  620. end;
  621. result[destindex]:=UCS4Char(0);
  622. { Trimming length in this particular case is just a waste of time,
  623. because result will be interpreted as null-terminated and discarded
  624. almost immediately }
  625. end;
  626. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  627. var
  628. hs1,hs2 : UCS4String;
  629. begin
  630. { wcscoll interprets null chars as end-of-string -> filter out }
  631. hs1:=WideStringToUCS4StringNoNulls(s1);
  632. hs2:=WideStringToUCS4StringNoNulls(s2);
  633. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  634. end;
  635. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  636. begin
  637. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  638. end;
  639. function CharLengthPChar(const Str: PChar): PtrInt;
  640. var
  641. nextlen: ptrint;
  642. s: pchar;
  643. {$ifndef beos}
  644. mbstate: mbstate_t;
  645. {$endif not beos}
  646. begin
  647. result:=0;
  648. s:=str;
  649. {$ifndef beos}
  650. fillchar(mbstate,sizeof(mbstate),0);
  651. {$endif not beos}
  652. repeat
  653. {$ifdef beos}
  654. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  655. {$else beos}
  656. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  657. {$endif beos}
  658. { skip invalid/incomplete sequences }
  659. if (nextlen<0) then
  660. nextlen:=1;
  661. inc(result,nextlen);
  662. inc(s,nextlen);
  663. until (nextlen=0);
  664. end;
  665. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  666. var
  667. nextlen: ptrint;
  668. {$ifndef beos}
  669. mbstate: mbstate_t;
  670. {$endif not beos}
  671. begin
  672. {$ifdef beos}
  673. result:=ptrint(mblen(str,maxlookahead));
  674. {$else beos}
  675. fillchar(mbstate,sizeof(mbstate),0);
  676. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  677. { mbrlen can also return -2 for "incomplete but potially valid character
  678. and data has been processed" }
  679. if result<0 then
  680. result:=-1;
  681. {$endif beos}
  682. end;
  683. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  684. var
  685. a,b: pchar;
  686. i: PtrInt;
  687. begin
  688. if not(canmodifys1) then
  689. getmem(a,len1+1)
  690. else
  691. a:=s1;
  692. for i:=0 to len1-1 do
  693. if s1[i]<>#0 then
  694. a[i]:=s1[i]
  695. else
  696. a[i]:=#32;
  697. a[len1]:=#0;
  698. if not(canmodifys2) then
  699. getmem(b,len2+1)
  700. else
  701. b:=s2;
  702. for i:=0 to len2-1 do
  703. if s2[i]<>#0 then
  704. b[i]:=s2[i]
  705. else
  706. b[i]:=#32;
  707. b[len2]:=#0;
  708. result:=strcoll(a,b);
  709. if not(canmodifys1) then
  710. freemem(a);
  711. if not(canmodifys2) then
  712. freemem(b);
  713. end;
  714. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  715. begin
  716. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  717. end;
  718. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  719. begin
  720. result:=strcoll(s1,s2);
  721. end;
  722. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  723. var
  724. a, b: AnsiString;
  725. begin
  726. a:=UpperAnsistring(s1);
  727. b:=UpperAnsistring(s2);
  728. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  729. end;
  730. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  731. begin
  732. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  733. end;
  734. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  735. var
  736. a, b: pchar;
  737. begin
  738. if (maxlen=0) then
  739. exit(0);
  740. if (s1[maxlen]<>#0) then
  741. begin
  742. getmem(a,maxlen+1);
  743. move(s1^,a^,maxlen);
  744. a[maxlen]:=#0;
  745. end
  746. else
  747. a:=s1;
  748. if (s2[maxlen]<>#0) then
  749. begin
  750. getmem(b,maxlen+1);
  751. move(s2^,b^,maxlen);
  752. b[maxlen]:=#0;
  753. end
  754. else
  755. b:=s2;
  756. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  757. if (a<>s1) then
  758. freemem(a);
  759. if (b<>s2) then
  760. freemem(b);
  761. end;
  762. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  763. var
  764. a, b: ansistring;
  765. begin
  766. if (maxlen=0) then
  767. exit(0);
  768. setlength(a,maxlen);
  769. move(s1^,a[1],maxlen);
  770. setlength(b,maxlen);
  771. move(s2^,b[1],maxlen);
  772. result:=AnsiCompareText(a,b);
  773. end;
  774. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  775. var
  776. newlen: sizeint;
  777. begin
  778. newlen:=length(s);
  779. if newlen>strlen(orgp) then
  780. fpc_rangeerror;
  781. p:=orgp;
  782. if (newlen>0) then
  783. move(s[1],p[0],newlen);
  784. p[newlen]:=#0;
  785. end;
  786. function AnsiStrLower(Str: PChar): PChar;
  787. var
  788. temp: ansistring;
  789. begin
  790. temp:=loweransistring(str);
  791. ansi2pchar(temp,str,result);
  792. end;
  793. function AnsiStrUpper(Str: PChar): PChar;
  794. var
  795. temp: ansistring;
  796. begin
  797. temp:=upperansistring(str);
  798. ansi2pchar(temp,str,result);
  799. end;
  800. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  801. var
  802. langinfo: pchar;
  803. begin
  804. langinfo:=nl_langinfo(CODESET);
  805. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  806. that causes it to return an empty string of UTF-8 locales
  807. -> patch up (and in general, UTF-8 is a good default on
  808. Unix platforms) }
  809. if not assigned(langinfo) or
  810. (langinfo^=#0) then
  811. langinfo:='UTF-8';
  812. Result := iconv2win(ansistring(langinfo));
  813. end;
  814. {$ifdef FPC_HAS_CPSTRING}
  815. {$i textrec.inc}
  816. procedure SetStdIOCodePage(var T: Text); inline;
  817. begin
  818. case TextRec(T).Mode of
  819. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  820. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  821. end;
  822. end;
  823. procedure SetStdIOCodePages; inline;
  824. begin
  825. SetStdIOCodePage(Input);
  826. SetStdIOCodePage(Output);
  827. SetStdIOCodePage(ErrOutput);
  828. SetStdIOCodePage(StdOut);
  829. SetStdIOCodePage(StdErr);
  830. end;
  831. {$endif FPC_HAS_CPSTRING}
  832. Procedure SetCWideStringManager;
  833. Var
  834. CWideStringManager : TUnicodeStringManager;
  835. begin
  836. CWideStringManager:=widestringmanager;
  837. With CWideStringManager do
  838. begin
  839. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  840. Ansi2WideMoveProc:=@Ansi2WideMove;
  841. UpperWideStringProc:=@UpperWideString;
  842. LowerWideStringProc:=@LowerWideString;
  843. CompareWideStringProc:=@CompareWideString;
  844. CompareTextWideStringProc:=@CompareTextWideString;
  845. CharLengthPCharProc:=@CharLengthPChar;
  846. CodePointLengthProc:=@CodePointLength;
  847. UpperAnsiStringProc:=@UpperAnsiString;
  848. LowerAnsiStringProc:=@LowerAnsiString;
  849. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  850. CompareTextAnsiStringProc:=@AnsiCompareText;
  851. StrCompAnsiStringProc:=@StrCompAnsi;
  852. StrICompAnsiStringProc:=@AnsiStrIComp;
  853. StrLCompAnsiStringProc:=@AnsiStrLComp;
  854. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  855. StrLowerAnsiStringProc:=@AnsiStrLower;
  856. StrUpperAnsiStringProc:=@AnsiStrUpper;
  857. ThreadInitProc:=@InitThread;
  858. ThreadFiniProc:=@FiniThread;
  859. { Unicode }
  860. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  861. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  862. UpperUnicodeStringProc:=@UpperWideString;
  863. LowerUnicodeStringProc:=@LowerWideString;
  864. CompareUnicodeStringProc:=@CompareWideString;
  865. CompareTextUnicodeStringProc:=@CompareTextWideString;
  866. { CodePage }
  867. GetStandardCodePageProc:=@GetStandardCodePage;
  868. end;
  869. SetUnicodeStringManager(CWideStringManager);
  870. end;
  871. var
  872. iconvlib:TLibHandle;
  873. initialization
  874. SetCWideStringManager;
  875. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  876. { with the information from the environment variables according to POSIX }
  877. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  878. setlocale(LC_ALL,'');
  879. { load iconvctl function }
  880. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  881. if iconvlib<>0 then
  882. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  883. { set the DefaultSystemCodePage }
  884. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  885. {$ifdef FPC_HAS_CPSTRING}
  886. SetStdIOCodePages;
  887. {$endif FPC_HAS_CPSTRING}
  888. { init conversion tables for main program }
  889. InitThread;
  890. finalization
  891. { fini conversion tables for main program }
  892. FiniThread;
  893. { unload iconv library }
  894. if iconvlib<>0 then
  895. FreeLibrary(iconvlib);
  896. end.