cwstring.pp 27 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$if defined(haiku)}
  21. {$linklib textencoding}
  22. {$linklib locale}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc,
  34. dynlibs;
  35. Const
  36. {$ifndef useiconv}
  37. libiconvname='c'; // is in libc under Linux.
  38. {$else}
  39. {$ifdef haiku}
  40. libiconvname='textencoding'; // is in libtextencoding under Haiku
  41. {$else}
  42. {$ifdef darwin}
  43. libiconvname='libiconv';
  44. {$else}
  45. libiconvname='iconv';
  46. {$endif}
  47. {$endif}
  48. {$endif}
  49. { helper functions from libc }
  50. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  51. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  52. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  53. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  54. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  55. {$ifndef beos}
  56. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  57. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  58. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  59. {$else beos}
  60. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  61. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  62. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  63. {$endif beos}
  64. const
  65. {$ifdef linux}
  66. __LC_CTYPE = 0;
  67. LC_ALL = 6;
  68. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  69. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  70. CODESET = _NL_CTYPE_CODESET_NAME;
  71. {$else linux}
  72. {$ifdef darwin}
  73. CODESET = 0;
  74. LC_ALL = 0;
  75. {$else darwin}
  76. {$ifdef FreeBSD} // actually FreeBSD5. internationalisation is afaik not default on 4.
  77. __LC_CTYPE = 0;
  78. LC_ALL = 0;
  79. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  80. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  81. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  82. {$else freebsd}
  83. {$ifdef solaris}
  84. CODESET=49;
  85. LC_ALL = 6;
  86. {$else solaris}
  87. {$ifdef beos}
  88. {$warning check correct value for BeOS}
  89. CODESET=49;
  90. {$ifdef haiku}
  91. LC_ALL = 0; // Checked for Haiku
  92. {$else}
  93. LC_ALL = 6; // Checked for BeOS
  94. {$endif}
  95. ESysEILSEQ = EILSEQ;
  96. {$else}
  97. {$ifdef OpenBSD}
  98. CODESET = 51;
  99. LC_ALL = 0;
  100. {$else not OpenBSD}
  101. {$ifdef NetBSD}
  102. CODESET = 51;
  103. LC_ALL = 0;
  104. {$else not NetBSD}
  105. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  106. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  107. {$endif NetBSD}
  108. {$endif OpenBSD}
  109. {$endif beos}
  110. {$endif solaris}
  111. {$endif FreeBSD}
  112. {$endif darwin}
  113. {$endif linux}
  114. { unicode encoding name }
  115. {$ifdef FPC_LITTLE_ENDIAN}
  116. unicode_encoding2 = 'UTF-16LE';
  117. unicode_encoding4 = 'UCS-4LE';
  118. {$else FPC_LITTLE_ENDIAN}
  119. unicode_encoding2 = 'UTF-16BE';
  120. unicode_encoding4 = 'UCS-4BE';
  121. {$endif FPC_LITTLE_ENDIAN}
  122. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  123. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  124. { that's a libc macro mapped to internal functions/variables }
  125. { and thus not a stable external API on systems where libc }
  126. { breaks backwards compatibility every now and then }
  127. MB_CUR_MAX = 10;
  128. { Requests for iconvctl }
  129. ICONV_TRIVIALP = 0; // int *argument
  130. ICONV_GET_TRANSLITERATE = 1; // int *argument
  131. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  132. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  133. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  134. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  135. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  136. type
  137. piconv_t = ^iconv_t;
  138. iconv_t = pointer;
  139. nl_item = cint;
  140. {$ifdef haiku}
  141. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  142. {$else}
  143. {$ifndef beos}
  144. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  145. {$endif}
  146. {$endif}
  147. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  148. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  149. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  150. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  151. const
  152. iconvctlname='iconvctl';
  153. {$else}
  154. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  155. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  156. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  157. const
  158. iconvctlname='libiconvctl';
  159. {$endif}
  160. var
  161. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  162. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  163. threadvar
  164. iconv_ansi2wide,
  165. iconv_wide2ansi : iconv_t;
  166. { since we cache the iconv_t converters, we have to do the same
  167. for the DefaultSystemCodePage variable since if it changes, we
  168. have to re-initialize the converters too. We can't do that via
  169. a callback in the widestring manager because DefaultSystemCodePage
  170. is not a threadvar and we can't automatically change this in all
  171. threads }
  172. current_DefaultSystemCodePage: TSystemCodePage;
  173. function win2iconv(cp: word): rawbytestring; forward;
  174. procedure InitThread;
  175. var
  176. transliterate: cint;
  177. {$if not(defined(darwin) and defined(cpuarm))}
  178. iconvname: rawbytestring;
  179. {$endif}
  180. begin
  181. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  182. {$if not(defined(darwin) and defined(cpuarm))}
  183. iconvname:=win2iconv(DefaultSystemCodePage);
  184. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  185. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  186. {$else}
  187. { Unix locale settings are ignored on iPhoneOS }
  188. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  189. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  190. {$endif}
  191. if assigned(iconvctl) and
  192. (iconv_wide2ansi<>iconv_t(-1)) then
  193. begin
  194. transliterate:=1;
  195. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  196. end;
  197. end;
  198. procedure FiniThread;
  199. begin
  200. if (iconv_wide2ansi <> iconv_t(-1)) then
  201. iconv_close(iconv_wide2ansi);
  202. if (iconv_ansi2wide <> iconv_t(-1)) then
  203. iconv_close(iconv_ansi2wide);
  204. end;
  205. {$i winiconv.inc}
  206. {$if defined(beos) and not defined(haiku)}
  207. function nl_langinfo(__item:nl_item):pchar;
  208. begin
  209. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  210. // Now implement the minimum required to correctly initialize WideString support
  211. case __item of
  212. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  213. else
  214. begin
  215. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  216. Result := '';
  217. end;
  218. end;
  219. end;
  220. {$endif}
  221. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  222. var
  223. outlength,
  224. outoffset,
  225. srclen,
  226. outleft : size_t;
  227. use_iconv: iconv_t;
  228. srcpos : pwidechar;
  229. destpos: pchar;
  230. mynil : pchar;
  231. my0 : size_t;
  232. err,
  233. transliterate: cint;
  234. free_iconv: boolean;
  235. begin
  236. if (cp=DefaultSystemCodePage) then
  237. begin
  238. { update iconv converter in case the DefaultSystemCodePage has been
  239. changed }
  240. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  241. begin
  242. FiniThread;
  243. InitThread;
  244. end;
  245. use_iconv:=iconv_wide2ansi;
  246. free_iconv:=false;
  247. end
  248. else
  249. begin
  250. { TODO: add caching (then we also don't need separate code for
  251. the default system page and other ones)
  252. -- typecasting an ansistring function result to pchar is
  253. unsafe normally, but these are constant strings -> no
  254. problem }
  255. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  256. if assigned(iconvctl) then
  257. begin
  258. transliterate:=1;
  259. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  260. end;
  261. free_iconv:=true;
  262. end;
  263. { unsupported encoding -> default move }
  264. if use_iconv=iconv_t(-1) then
  265. begin
  266. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  267. exit;
  268. end;
  269. mynil:=nil;
  270. my0:=0;
  271. { rought estimation }
  272. setlength(dest,len*3);
  273. outlength:=len*3;
  274. srclen:=len*2;
  275. srcpos:=source;
  276. destpos:=pchar(dest);
  277. outleft:=outlength;
  278. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  279. begin
  280. err:=fpgetCerrno;
  281. case err of
  282. { last character is incomplete sequence }
  283. ESysEINVAL,
  284. { incomplete sequence in the middle }
  285. ESysEILSEQ:
  286. begin
  287. { skip and set to '?' }
  288. inc(srcpos);
  289. dec(srclen,2);
  290. destpos^:='?';
  291. inc(destpos);
  292. dec(outleft);
  293. { reset }
  294. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  295. if err=ESysEINVAL then
  296. break;
  297. end;
  298. ESysE2BIG:
  299. begin
  300. outoffset:=destpos-pchar(dest);
  301. { extend }
  302. setlength(dest,outlength+len*3);
  303. inc(outleft,len*3);
  304. inc(outlength,len*3);
  305. { string could have been moved }
  306. destpos:=pchar(dest)+outoffset;
  307. end;
  308. else
  309. runerror(231);
  310. end;
  311. end;
  312. // truncate string
  313. setlength(dest,length(dest)-outleft);
  314. SetCodePage(dest,cp,false);
  315. if free_iconv then
  316. iconv_close(use_iconv);
  317. end;
  318. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  319. var
  320. outlength,
  321. outoffset,
  322. outleft : size_t;
  323. use_iconv: iconv_t;
  324. srcpos,
  325. destpos: pchar;
  326. mynil : pchar;
  327. my0 : size_t;
  328. err: cint;
  329. free_iconv: boolean;
  330. begin
  331. if (cp=DefaultSystemCodePage) then
  332. begin
  333. { update iconv converter in case the DefaultSystemCodePage has been
  334. changed }
  335. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  336. begin
  337. FiniThread;
  338. InitThread;
  339. end;
  340. use_iconv:=iconv_ansi2wide;
  341. free_iconv:=false;
  342. end
  343. else
  344. begin
  345. { TODO: add caching (then we also don't need separate code for
  346. the default system page and other ones)
  347. -- typecasting an ansistring function result to pchar is
  348. unsafe normally, but these are constant strings -> no
  349. problem }
  350. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  351. free_iconv:=true;
  352. end;
  353. { unsupported encoding -> default move }
  354. if use_iconv=iconv_t(-1) then
  355. begin
  356. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  357. exit;
  358. end;
  359. mynil:=nil;
  360. my0:=0;
  361. // extra space
  362. outlength:=len+1;
  363. setlength(dest,outlength);
  364. srcpos:=source;
  365. destpos:=pchar(dest);
  366. outleft:=outlength*2;
  367. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  368. begin
  369. err:=fpgetCerrno;
  370. case err of
  371. ESysEINVAL,
  372. ESysEILSEQ:
  373. begin
  374. { skip and set to '?' }
  375. inc(srcpos);
  376. dec(len);
  377. pwidechar(destpos)^:='?';
  378. inc(destpos,2);
  379. dec(outleft,2);
  380. { reset }
  381. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  382. if err=ESysEINVAL then
  383. break;
  384. end;
  385. ESysE2BIG:
  386. begin
  387. outoffset:=destpos-pchar(dest);
  388. { extend }
  389. setlength(dest,outlength+len);
  390. inc(outleft,len*2);
  391. inc(outlength,len);
  392. { string could have been moved }
  393. destpos:=pchar(dest)+outoffset;
  394. end;
  395. else
  396. runerror(231);
  397. end;
  398. end;
  399. // truncate string
  400. setlength(dest,length(dest)-outleft div 2);
  401. if free_iconv then
  402. iconv_close(use_iconv);
  403. end;
  404. function LowerWideString(const s : WideString) : WideString;
  405. var
  406. i : SizeInt;
  407. begin
  408. SetLength(result,length(s));
  409. for i:=0 to length(s)-1 do
  410. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  411. end;
  412. function UpperWideString(const s : WideString) : WideString;
  413. var
  414. i : SizeInt;
  415. begin
  416. SetLength(result,length(s));
  417. for i:=0 to length(s)-1 do
  418. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  419. end;
  420. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  421. begin
  422. if (len>length(s)) then
  423. if (length(s) < 10*256) then
  424. setlength(s,length(s)+10)
  425. else
  426. setlength(s,length(s)+length(s) shr 8);
  427. end;
  428. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  429. begin
  430. EnsureAnsiLen(s,index);
  431. pchar(@s[index])^:=c;
  432. inc(index);
  433. end;
  434. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  435. {$ifndef beos}
  436. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  437. {$else not beos}
  438. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  439. {$endif beos}
  440. var
  441. p : pchar;
  442. mblen : size_t;
  443. begin
  444. { we know that s is unique -> avoid uniquestring calls}
  445. p:=@s[index];
  446. if (nc<=127) then
  447. ConcatCharToAnsiStr(char(nc),s,index)
  448. else
  449. begin
  450. EnsureAnsiLen(s,index+MB_CUR_MAX);
  451. {$ifndef beos}
  452. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  453. {$else not beos}
  454. mblen:=wctomb(p,wchar_t(nc));
  455. {$endif not beos}
  456. if (mblen<>size_t(-1)) then
  457. inc(index,mblen)
  458. else
  459. begin
  460. { invalid wide char }
  461. p^:='?';
  462. inc(index);
  463. end;
  464. end;
  465. end;
  466. function LowerAnsiString(const s : AnsiString) : AnsiString;
  467. var
  468. i, slen,
  469. resindex : SizeInt;
  470. mblen : size_t;
  471. {$ifndef beos}
  472. ombstate,
  473. nmbstate : mbstate_t;
  474. {$endif beos}
  475. wc : wchar_t;
  476. begin
  477. {$ifndef beos}
  478. fillchar(ombstate,sizeof(ombstate),0);
  479. fillchar(nmbstate,sizeof(nmbstate),0);
  480. {$endif beos}
  481. slen:=length(s);
  482. SetLength(result,slen+10);
  483. i:=1;
  484. resindex:=1;
  485. while (i<=slen) do
  486. begin
  487. if (s[i]<=#127) then
  488. begin
  489. wc:=wchar_t(s[i]);
  490. mblen:= 1;
  491. end
  492. else
  493. {$ifndef beos}
  494. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  495. {$else not beos}
  496. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  497. {$endif not beos}
  498. case mblen of
  499. size_t(-2):
  500. begin
  501. { partial invalid character, copy literally }
  502. while (i<=slen) do
  503. begin
  504. ConcatCharToAnsiStr(s[i],result,resindex);
  505. inc(i);
  506. end;
  507. end;
  508. size_t(-1), 0:
  509. begin
  510. { invalid or null character }
  511. ConcatCharToAnsiStr(s[i],result,resindex);
  512. inc(i);
  513. end;
  514. else
  515. begin
  516. { a valid sequence }
  517. { even if mblen = 1, the lowercase version may have a }
  518. { different length }
  519. { We can't do anything special if wchar_t is 16 bit... }
  520. {$ifndef beos}
  521. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  522. {$else not beos}
  523. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  524. {$endif not beos}
  525. inc(i,mblen);
  526. end;
  527. end;
  528. end;
  529. SetLength(result,resindex-1);
  530. end;
  531. function UpperAnsiString(const s : AnsiString) : AnsiString;
  532. var
  533. i, slen,
  534. resindex : SizeInt;
  535. mblen : size_t;
  536. {$ifndef beos}
  537. ombstate,
  538. nmbstate : mbstate_t;
  539. {$endif beos}
  540. wc : wchar_t;
  541. begin
  542. {$ifndef beos}
  543. fillchar(ombstate,sizeof(ombstate),0);
  544. fillchar(nmbstate,sizeof(nmbstate),0);
  545. {$endif beos}
  546. slen:=length(s);
  547. SetLength(result,slen+10);
  548. i:=1;
  549. resindex:=1;
  550. while (i<=slen) do
  551. begin
  552. if (s[i]<=#127) then
  553. begin
  554. wc:=wchar_t(s[i]);
  555. mblen:= 1;
  556. end
  557. else
  558. {$ifndef beos}
  559. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  560. {$else not beos}
  561. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  562. {$endif beos}
  563. case mblen of
  564. size_t(-2):
  565. begin
  566. { partial invalid character, copy literally }
  567. while (i<=slen) do
  568. begin
  569. ConcatCharToAnsiStr(s[i],result,resindex);
  570. inc(i);
  571. end;
  572. end;
  573. size_t(-1), 0:
  574. begin
  575. { invalid or null character }
  576. ConcatCharToAnsiStr(s[i],result,resindex);
  577. inc(i);
  578. end;
  579. else
  580. begin
  581. { a valid sequence }
  582. { even if mblen = 1, the uppercase version may have a }
  583. { different length }
  584. { We can't do anything special if wchar_t is 16 bit... }
  585. {$ifndef beos}
  586. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  587. {$else not beos}
  588. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  589. {$endif not beos}
  590. inc(i,mblen);
  591. end;
  592. end;
  593. end;
  594. SetLength(result,resindex-1);
  595. end;
  596. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  597. var
  598. i, slen,
  599. destindex : SizeInt;
  600. uch : UCS4Char;
  601. begin
  602. slen:=length(s);
  603. setlength(result,slen+1);
  604. i:=1;
  605. destindex:=0;
  606. while (i<=slen) do
  607. begin
  608. uch:=UCS4Char(s[i]);
  609. if (uch=0) then
  610. result[destindex]:=32
  611. else if (uch<=$d7ff) or (uch>=$e000) then
  612. result[destindex]:=uch
  613. else if (uch<=$dbff) and
  614. (i<slen) and
  615. (s[i+1]>=#$dc00) and
  616. (s[i+1]<=#$dfff) then
  617. begin
  618. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  619. inc(i);
  620. end
  621. else { invalid surrogate pair }
  622. result[destindex]:=uch;
  623. inc(i);
  624. inc(destindex);
  625. end;
  626. result[destindex]:=UCS4Char(0);
  627. { Trimming length in this particular case is just a waste of time,
  628. because result will be interpreted as null-terminated and discarded
  629. almost immediately }
  630. end;
  631. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  632. var
  633. hs1,hs2 : UCS4String;
  634. begin
  635. { wcscoll interprets null chars as end-of-string -> filter out }
  636. hs1:=WideStringToUCS4StringNoNulls(s1);
  637. hs2:=WideStringToUCS4StringNoNulls(s2);
  638. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  639. end;
  640. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  641. begin
  642. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  643. end;
  644. function CharLengthPChar(const Str: PChar): PtrInt;
  645. var
  646. nextlen: ptrint;
  647. s: pchar;
  648. {$ifndef beos}
  649. mbstate: mbstate_t;
  650. {$endif not beos}
  651. begin
  652. result:=0;
  653. s:=str;
  654. {$ifndef beos}
  655. fillchar(mbstate,sizeof(mbstate),0);
  656. {$endif not beos}
  657. repeat
  658. {$ifdef beos}
  659. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  660. {$else beos}
  661. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  662. {$endif beos}
  663. { skip invalid/incomplete sequences }
  664. if (nextlen<0) then
  665. nextlen:=1;
  666. inc(result,nextlen);
  667. inc(s,nextlen);
  668. until (nextlen=0);
  669. end;
  670. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  671. var
  672. nextlen: ptrint;
  673. {$ifndef beos}
  674. mbstate: mbstate_t;
  675. {$endif not beos}
  676. begin
  677. {$ifdef beos}
  678. result:=ptrint(mblen(str,maxlookahead));
  679. {$else beos}
  680. fillchar(mbstate,sizeof(mbstate),0);
  681. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  682. { mbrlen can also return -2 for "incomplete but potially valid character
  683. and data has been processed" }
  684. if result<0 then
  685. result:=-1;
  686. {$endif beos}
  687. end;
  688. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  689. var
  690. a,b: pchar;
  691. i: PtrInt;
  692. begin
  693. if not(canmodifys1) then
  694. getmem(a,len1+1)
  695. else
  696. a:=s1;
  697. for i:=0 to len1-1 do
  698. if s1[i]<>#0 then
  699. a[i]:=s1[i]
  700. else
  701. a[i]:=#32;
  702. a[len1]:=#0;
  703. if not(canmodifys2) then
  704. getmem(b,len2+1)
  705. else
  706. b:=s2;
  707. for i:=0 to len2-1 do
  708. if s2[i]<>#0 then
  709. b[i]:=s2[i]
  710. else
  711. b[i]:=#32;
  712. b[len2]:=#0;
  713. result:=strcoll(a,b);
  714. if not(canmodifys1) then
  715. freemem(a);
  716. if not(canmodifys2) then
  717. freemem(b);
  718. end;
  719. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  720. begin
  721. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  722. end;
  723. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  724. begin
  725. result:=strcoll(s1,s2);
  726. end;
  727. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  728. var
  729. a, b: AnsiString;
  730. begin
  731. a:=UpperAnsistring(s1);
  732. b:=UpperAnsistring(s2);
  733. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  734. end;
  735. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  736. begin
  737. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  738. end;
  739. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  740. var
  741. a, b: pchar;
  742. begin
  743. if (maxlen=0) then
  744. exit(0);
  745. if (s1[maxlen]<>#0) then
  746. begin
  747. getmem(a,maxlen+1);
  748. move(s1^,a^,maxlen);
  749. a[maxlen]:=#0;
  750. end
  751. else
  752. a:=s1;
  753. if (s2[maxlen]<>#0) then
  754. begin
  755. getmem(b,maxlen+1);
  756. move(s2^,b^,maxlen);
  757. b[maxlen]:=#0;
  758. end
  759. else
  760. b:=s2;
  761. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  762. if (a<>s1) then
  763. freemem(a);
  764. if (b<>s2) then
  765. freemem(b);
  766. end;
  767. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  768. var
  769. a, b: ansistring;
  770. begin
  771. if (maxlen=0) then
  772. exit(0);
  773. setlength(a,maxlen);
  774. move(s1^,a[1],maxlen);
  775. setlength(b,maxlen);
  776. move(s2^,b[1],maxlen);
  777. result:=AnsiCompareText(a,b);
  778. end;
  779. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  780. var
  781. newlen: sizeint;
  782. begin
  783. newlen:=length(s);
  784. if newlen>strlen(orgp) then
  785. fpc_rangeerror;
  786. p:=orgp;
  787. if (newlen>0) then
  788. move(s[1],p[0],newlen);
  789. p[newlen]:=#0;
  790. end;
  791. function AnsiStrLower(Str: PChar): PChar;
  792. var
  793. temp: ansistring;
  794. begin
  795. temp:=loweransistring(str);
  796. ansi2pchar(temp,str,result);
  797. end;
  798. function AnsiStrUpper(Str: PChar): PChar;
  799. var
  800. temp: ansistring;
  801. begin
  802. temp:=upperansistring(str);
  803. ansi2pchar(temp,str,result);
  804. end;
  805. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  806. var
  807. langinfo: pchar;
  808. begin
  809. langinfo:=nl_langinfo(CODESET);
  810. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  811. that causes it to return an empty string of UTF-8 locales
  812. -> patch up (and in general, UTF-8 is a good default on
  813. Unix platforms) }
  814. if not assigned(langinfo) or
  815. (langinfo^=#0) then
  816. langinfo:='UTF-8';
  817. Result := iconv2win(ansistring(langinfo));
  818. end;
  819. {$ifdef FPC_HAS_CPSTRING}
  820. {$i textrec.inc}
  821. procedure SetStdIOCodePage(var T: Text); inline;
  822. begin
  823. case TextRec(T).Mode of
  824. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  825. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  826. end;
  827. end;
  828. procedure SetStdIOCodePages; inline;
  829. begin
  830. SetStdIOCodePage(Input);
  831. SetStdIOCodePage(Output);
  832. SetStdIOCodePage(ErrOutput);
  833. SetStdIOCodePage(StdOut);
  834. SetStdIOCodePage(StdErr);
  835. end;
  836. {$endif FPC_HAS_CPSTRING}
  837. Procedure SetCWideStringManager;
  838. Var
  839. CWideStringManager : TUnicodeStringManager;
  840. begin
  841. CWideStringManager:=widestringmanager;
  842. With CWideStringManager do
  843. begin
  844. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  845. Ansi2WideMoveProc:=@Ansi2WideMove;
  846. UpperWideStringProc:=@UpperWideString;
  847. LowerWideStringProc:=@LowerWideString;
  848. CompareWideStringProc:=@CompareWideString;
  849. CompareTextWideStringProc:=@CompareTextWideString;
  850. CharLengthPCharProc:=@CharLengthPChar;
  851. CodePointLengthProc:=@CodePointLength;
  852. UpperAnsiStringProc:=@UpperAnsiString;
  853. LowerAnsiStringProc:=@LowerAnsiString;
  854. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  855. CompareTextAnsiStringProc:=@AnsiCompareText;
  856. StrCompAnsiStringProc:=@StrCompAnsi;
  857. StrICompAnsiStringProc:=@AnsiStrIComp;
  858. StrLCompAnsiStringProc:=@AnsiStrLComp;
  859. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  860. StrLowerAnsiStringProc:=@AnsiStrLower;
  861. StrUpperAnsiStringProc:=@AnsiStrUpper;
  862. ThreadInitProc:=@InitThread;
  863. ThreadFiniProc:=@FiniThread;
  864. { Unicode }
  865. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  866. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  867. UpperUnicodeStringProc:=@UpperWideString;
  868. LowerUnicodeStringProc:=@LowerWideString;
  869. CompareUnicodeStringProc:=@CompareWideString;
  870. CompareTextUnicodeStringProc:=@CompareTextWideString;
  871. { CodePage }
  872. GetStandardCodePageProc:=@GetStandardCodePage;
  873. end;
  874. SetUnicodeStringManager(CWideStringManager);
  875. end;
  876. var
  877. iconvlib:TLibHandle;
  878. initialization
  879. SetCWideStringManager;
  880. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  881. { with the information from the environment variables according to POSIX }
  882. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  883. setlocale(LC_ALL,'');
  884. { load iconvctl function }
  885. iconvlib:=LoadLibrary(libiconvname+'.'+SharedSuffix);
  886. if iconvlib<>0 then
  887. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  888. { set the DefaultSystemCodePage }
  889. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  890. {$ifdef FPC_HAS_CPSTRING}
  891. SetStdIOCodePages;
  892. {$endif FPC_HAS_CPSTRING}
  893. { init conversion tables for main program }
  894. InitThread;
  895. finalization
  896. { fini conversion tables for main program }
  897. FiniThread;
  898. { unload iconv library }
  899. if iconvlib<>0 then
  900. FreeLibrary(iconvlib);
  901. end.