cwstring.pp 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$if defined(haiku)}
  21. {$linklib textencoding}
  22. {$linklib locale}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc,
  34. dynlibs;
  35. Const
  36. {$ifndef useiconv}
  37. libiconvname='c'; // is in libc under Linux.
  38. libprefix='lib';
  39. {$else}
  40. {$ifdef haiku}
  41. libiconvname='textencoding'; // is in libtextencoding under Haiku
  42. libprefix='lib';
  43. {$else}
  44. {$ifdef darwin}
  45. libiconvname='libiconv';
  46. libprefix='';
  47. {$else}
  48. libiconvname='iconv';
  49. libprefix='lib';
  50. {$endif}
  51. {$endif}
  52. {$endif}
  53. { helper functions from libc }
  54. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  55. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  56. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  57. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  58. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  59. {$ifndef beos}
  60. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  61. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  62. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  63. {$else beos}
  64. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  65. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  66. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  67. {$endif beos}
  68. const
  69. {$ifdef linux}
  70. __LC_CTYPE = 0;
  71. LC_ALL = 6;
  72. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  73. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  74. CODESET = _NL_CTYPE_CODESET_NAME;
  75. {$else linux}
  76. {$ifdef darwin}
  77. CODESET = 0;
  78. LC_ALL = 0;
  79. {$else darwin}
  80. {$ifdef FreeBSD} // actually FreeBSD5. internationalisation is afaik not default on 4.
  81. __LC_CTYPE = 0;
  82. LC_ALL = 0;
  83. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  84. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  85. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  86. {$else freebsd}
  87. {$ifdef solaris}
  88. CODESET=49;
  89. LC_ALL = 6;
  90. {$else solaris}
  91. {$ifdef beos}
  92. {$warning check correct value for BeOS}
  93. CODESET=49;
  94. {$ifdef haiku}
  95. LC_ALL = 0; // Checked for Haiku
  96. {$else}
  97. LC_ALL = 6; // Checked for BeOS
  98. {$endif}
  99. ESysEILSEQ = EILSEQ;
  100. {$else}
  101. {$ifdef OpenBSD}
  102. CODESET = 51;
  103. LC_ALL = 0;
  104. {$else not OpenBSD}
  105. {$ifdef NetBSD}
  106. CODESET = 51;
  107. LC_ALL = 0;
  108. {$else not NetBSD}
  109. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  110. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  111. {$endif NetBSD}
  112. {$endif OpenBSD}
  113. {$endif beos}
  114. {$endif solaris}
  115. {$endif FreeBSD}
  116. {$endif darwin}
  117. {$endif linux}
  118. { unicode encoding name }
  119. {$ifdef FPC_LITTLE_ENDIAN}
  120. unicode_encoding2 = 'UTF-16LE';
  121. unicode_encoding4 = 'UCS-4LE';
  122. {$else FPC_LITTLE_ENDIAN}
  123. unicode_encoding2 = 'UTF-16BE';
  124. unicode_encoding4 = 'UCS-4BE';
  125. {$endif FPC_LITTLE_ENDIAN}
  126. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  127. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  128. { that's a libc macro mapped to internal functions/variables }
  129. { and thus not a stable external API on systems where libc }
  130. { breaks backwards compatibility every now and then }
  131. MB_CUR_MAX = 10;
  132. { Requests for iconvctl }
  133. ICONV_TRIVIALP = 0; // int *argument
  134. ICONV_GET_TRANSLITERATE = 1; // int *argument
  135. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  136. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  137. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  138. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  139. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  140. type
  141. piconv_t = ^iconv_t;
  142. iconv_t = pointer;
  143. nl_item = cint;
  144. {$ifdef haiku}
  145. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  146. {$else}
  147. {$ifndef beos}
  148. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  149. {$endif}
  150. {$endif}
  151. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  152. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  153. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  154. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  155. const
  156. iconvctlname='iconvctl';
  157. {$else}
  158. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  159. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  160. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  161. const
  162. iconvctlname='libiconvctl';
  163. {$endif}
  164. var
  165. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  166. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  167. threadvar
  168. iconv_ansi2wide,
  169. iconv_wide2ansi : iconv_t;
  170. { since we cache the iconv_t converters, we have to do the same
  171. for the DefaultSystemCodePage variable since if it changes, we
  172. have to re-initialize the converters too. We can't do that via
  173. a callback in the widestring manager because DefaultSystemCodePage
  174. is not a threadvar and we can't automatically change this in all
  175. threads }
  176. current_DefaultSystemCodePage: TSystemCodePage;
  177. function win2iconv(cp: word): rawbytestring; forward;
  178. procedure InitThread;
  179. var
  180. transliterate: cint;
  181. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  182. iconvname: rawbytestring;
  183. {$endif}
  184. begin
  185. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  186. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  187. iconvname:=win2iconv(DefaultSystemCodePage);
  188. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  189. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  190. {$else}
  191. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  192. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  193. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  194. {$endif}
  195. if assigned(iconvctl) and
  196. (iconv_wide2ansi<>iconv_t(-1)) then
  197. begin
  198. transliterate:=1;
  199. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  200. end;
  201. end;
  202. procedure FiniThread;
  203. begin
  204. if (iconv_wide2ansi <> iconv_t(-1)) then
  205. iconv_close(iconv_wide2ansi);
  206. if (iconv_ansi2wide <> iconv_t(-1)) then
  207. iconv_close(iconv_ansi2wide);
  208. end;
  209. {$i winiconv.inc}
  210. {$if defined(beos) and not defined(haiku)}
  211. function nl_langinfo(__item:nl_item):pchar;
  212. begin
  213. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  214. // Now implement the minimum required to correctly initialize WideString support
  215. case __item of
  216. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  217. else
  218. begin
  219. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  220. Result := '';
  221. end;
  222. end;
  223. end;
  224. {$endif}
  225. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  226. var
  227. outlength,
  228. outoffset,
  229. srclen,
  230. outleft : size_t;
  231. use_iconv: iconv_t;
  232. srcpos : pwidechar;
  233. destpos: pchar;
  234. mynil : pchar;
  235. my0 : size_t;
  236. err,
  237. transliterate: cint;
  238. free_iconv: boolean;
  239. begin
  240. if (cp=DefaultSystemCodePage) then
  241. begin
  242. { update iconv converter in case the DefaultSystemCodePage has been
  243. changed }
  244. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  245. begin
  246. FiniThread;
  247. InitThread;
  248. end;
  249. use_iconv:=iconv_wide2ansi;
  250. free_iconv:=false;
  251. end
  252. else
  253. begin
  254. { TODO: add caching (then we also don't need separate code for
  255. the default system page and other ones)
  256. -- typecasting an ansistring function result to pchar is
  257. unsafe normally, but these are constant strings -> no
  258. problem }
  259. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  260. if assigned(iconvctl) then
  261. begin
  262. transliterate:=1;
  263. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  264. end;
  265. free_iconv:=true;
  266. end;
  267. { unsupported encoding -> default move }
  268. if use_iconv=iconv_t(-1) then
  269. begin
  270. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  271. exit;
  272. end;
  273. mynil:=nil;
  274. my0:=0;
  275. { rought estimation }
  276. setlength(dest,len*3);
  277. outlength:=len*3;
  278. srclen:=len*2;
  279. srcpos:=source;
  280. destpos:=pchar(dest);
  281. outleft:=outlength;
  282. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  283. begin
  284. err:=fpgetCerrno;
  285. case err of
  286. { last character is incomplete sequence }
  287. ESysEINVAL,
  288. { incomplete sequence in the middle }
  289. ESysEILSEQ:
  290. begin
  291. { skip and set to '?' }
  292. inc(srcpos);
  293. dec(srclen,2);
  294. destpos^:='?';
  295. inc(destpos);
  296. dec(outleft);
  297. { reset }
  298. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  299. if err=ESysEINVAL then
  300. break;
  301. end;
  302. ESysE2BIG:
  303. begin
  304. outoffset:=destpos-pchar(dest);
  305. { extend }
  306. setlength(dest,outlength+len*3);
  307. inc(outleft,len*3);
  308. inc(outlength,len*3);
  309. { string could have been moved }
  310. destpos:=pchar(dest)+outoffset;
  311. end;
  312. else
  313. runerror(231);
  314. end;
  315. end;
  316. // truncate string
  317. setlength(dest,length(dest)-outleft);
  318. SetCodePage(dest,cp,false);
  319. if free_iconv then
  320. iconv_close(use_iconv);
  321. end;
  322. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  323. var
  324. outlength,
  325. outoffset,
  326. outleft : size_t;
  327. use_iconv: iconv_t;
  328. srcpos,
  329. destpos: pchar;
  330. mynil : pchar;
  331. my0 : size_t;
  332. err: cint;
  333. free_iconv: boolean;
  334. begin
  335. if (cp=DefaultSystemCodePage) then
  336. begin
  337. { update iconv converter in case the DefaultSystemCodePage has been
  338. changed }
  339. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  340. begin
  341. FiniThread;
  342. InitThread;
  343. end;
  344. use_iconv:=iconv_ansi2wide;
  345. free_iconv:=false;
  346. end
  347. else
  348. begin
  349. { TODO: add caching (then we also don't need separate code for
  350. the default system page and other ones)
  351. -- typecasting an ansistring function result to pchar is
  352. unsafe normally, but these are constant strings -> no
  353. problem }
  354. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  355. free_iconv:=true;
  356. end;
  357. { unsupported encoding -> default move }
  358. if use_iconv=iconv_t(-1) then
  359. begin
  360. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  361. exit;
  362. end;
  363. mynil:=nil;
  364. my0:=0;
  365. // extra space
  366. outlength:=len+1;
  367. setlength(dest,outlength);
  368. srcpos:=source;
  369. destpos:=pchar(dest);
  370. outleft:=outlength*2;
  371. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  372. begin
  373. err:=fpgetCerrno;
  374. case err of
  375. ESysEINVAL,
  376. ESysEILSEQ:
  377. begin
  378. { skip and set to '?' }
  379. inc(srcpos);
  380. dec(len);
  381. pwidechar(destpos)^:='?';
  382. inc(destpos,2);
  383. dec(outleft,2);
  384. { reset }
  385. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  386. if err=ESysEINVAL then
  387. break;
  388. end;
  389. ESysE2BIG:
  390. begin
  391. outoffset:=destpos-pchar(dest);
  392. { extend }
  393. setlength(dest,outlength+len);
  394. inc(outleft,len*2);
  395. inc(outlength,len);
  396. { string could have been moved }
  397. destpos:=pchar(dest)+outoffset;
  398. end;
  399. else
  400. runerror(231);
  401. end;
  402. end;
  403. // truncate string
  404. setlength(dest,length(dest)-outleft div 2);
  405. if free_iconv then
  406. iconv_close(use_iconv);
  407. end;
  408. function LowerWideString(const s : WideString) : WideString;
  409. var
  410. i : SizeInt;
  411. begin
  412. SetLength(result,length(s));
  413. for i:=0 to length(s)-1 do
  414. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  415. end;
  416. function UpperWideString(const s : WideString) : WideString;
  417. var
  418. i : SizeInt;
  419. begin
  420. SetLength(result,length(s));
  421. for i:=0 to length(s)-1 do
  422. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  423. end;
  424. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  425. begin
  426. if (len>length(s)) then
  427. if (length(s) < 10*256) then
  428. setlength(s,length(s)+10)
  429. else
  430. setlength(s,length(s)+length(s) shr 8);
  431. end;
  432. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  433. begin
  434. EnsureAnsiLen(s,index);
  435. pchar(@s[index])^:=c;
  436. inc(index);
  437. end;
  438. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  439. {$ifndef beos}
  440. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  441. {$else not beos}
  442. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  443. {$endif beos}
  444. var
  445. p : pchar;
  446. mblen : size_t;
  447. begin
  448. { we know that s is unique -> avoid uniquestring calls}
  449. p:=@s[index];
  450. if (nc<=127) then
  451. ConcatCharToAnsiStr(char(nc),s,index)
  452. else
  453. begin
  454. EnsureAnsiLen(s,index+MB_CUR_MAX);
  455. {$ifndef beos}
  456. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  457. {$else not beos}
  458. mblen:=wctomb(p,wchar_t(nc));
  459. {$endif not beos}
  460. if (mblen<>size_t(-1)) then
  461. inc(index,mblen)
  462. else
  463. begin
  464. { invalid wide char }
  465. p^:='?';
  466. inc(index);
  467. end;
  468. end;
  469. end;
  470. function LowerAnsiString(const s : AnsiString) : AnsiString;
  471. var
  472. i, slen,
  473. resindex : SizeInt;
  474. mblen : size_t;
  475. {$ifndef beos}
  476. ombstate,
  477. nmbstate : mbstate_t;
  478. {$endif beos}
  479. wc : wchar_t;
  480. begin
  481. {$ifndef beos}
  482. fillchar(ombstate,sizeof(ombstate),0);
  483. fillchar(nmbstate,sizeof(nmbstate),0);
  484. {$endif beos}
  485. slen:=length(s);
  486. SetLength(result,slen+10);
  487. i:=1;
  488. resindex:=1;
  489. while (i<=slen) do
  490. begin
  491. if (s[i]<=#127) then
  492. begin
  493. wc:=wchar_t(s[i]);
  494. mblen:= 1;
  495. end
  496. else
  497. {$ifndef beos}
  498. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  499. {$else not beos}
  500. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  501. {$endif not beos}
  502. case mblen of
  503. size_t(-2):
  504. begin
  505. { partial invalid character, copy literally }
  506. while (i<=slen) do
  507. begin
  508. ConcatCharToAnsiStr(s[i],result,resindex);
  509. inc(i);
  510. end;
  511. end;
  512. size_t(-1), 0:
  513. begin
  514. { invalid or null character }
  515. ConcatCharToAnsiStr(s[i],result,resindex);
  516. inc(i);
  517. end;
  518. else
  519. begin
  520. { a valid sequence }
  521. { even if mblen = 1, the lowercase version may have a }
  522. { different length }
  523. { We can't do anything special if wchar_t is 16 bit... }
  524. {$ifndef beos}
  525. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  526. {$else not beos}
  527. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  528. {$endif not beos}
  529. inc(i,mblen);
  530. end;
  531. end;
  532. end;
  533. SetLength(result,resindex-1);
  534. end;
  535. function UpperAnsiString(const s : AnsiString) : AnsiString;
  536. var
  537. i, slen,
  538. resindex : SizeInt;
  539. mblen : size_t;
  540. {$ifndef beos}
  541. ombstate,
  542. nmbstate : mbstate_t;
  543. {$endif beos}
  544. wc : wchar_t;
  545. begin
  546. {$ifndef beos}
  547. fillchar(ombstate,sizeof(ombstate),0);
  548. fillchar(nmbstate,sizeof(nmbstate),0);
  549. {$endif beos}
  550. slen:=length(s);
  551. SetLength(result,slen+10);
  552. i:=1;
  553. resindex:=1;
  554. while (i<=slen) do
  555. begin
  556. if (s[i]<=#127) then
  557. begin
  558. wc:=wchar_t(s[i]);
  559. mblen:= 1;
  560. end
  561. else
  562. {$ifndef beos}
  563. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  564. {$else not beos}
  565. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  566. {$endif beos}
  567. case mblen of
  568. size_t(-2):
  569. begin
  570. { partial invalid character, copy literally }
  571. while (i<=slen) do
  572. begin
  573. ConcatCharToAnsiStr(s[i],result,resindex);
  574. inc(i);
  575. end;
  576. end;
  577. size_t(-1), 0:
  578. begin
  579. { invalid or null character }
  580. ConcatCharToAnsiStr(s[i],result,resindex);
  581. inc(i);
  582. end;
  583. else
  584. begin
  585. { a valid sequence }
  586. { even if mblen = 1, the uppercase version may have a }
  587. { different length }
  588. { We can't do anything special if wchar_t is 16 bit... }
  589. {$ifndef beos}
  590. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  591. {$else not beos}
  592. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  593. {$endif not beos}
  594. inc(i,mblen);
  595. end;
  596. end;
  597. end;
  598. SetLength(result,resindex-1);
  599. end;
  600. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  601. var
  602. i, slen,
  603. destindex : SizeInt;
  604. uch : UCS4Char;
  605. begin
  606. slen:=length(s);
  607. setlength(result,slen+1);
  608. i:=1;
  609. destindex:=0;
  610. while (i<=slen) do
  611. begin
  612. uch:=UCS4Char(s[i]);
  613. if (uch=0) then
  614. result[destindex]:=32
  615. else if (uch<=$d7ff) or (uch>=$e000) then
  616. result[destindex]:=uch
  617. else if (uch<=$dbff) and
  618. (i<slen) and
  619. (s[i+1]>=#$dc00) and
  620. (s[i+1]<=#$dfff) then
  621. begin
  622. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  623. inc(i);
  624. end
  625. else { invalid surrogate pair }
  626. result[destindex]:=uch;
  627. inc(i);
  628. inc(destindex);
  629. end;
  630. result[destindex]:=UCS4Char(0);
  631. { Trimming length in this particular case is just a waste of time,
  632. because result will be interpreted as null-terminated and discarded
  633. almost immediately }
  634. end;
  635. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  636. var
  637. hs1,hs2 : UCS4String;
  638. begin
  639. { wcscoll interprets null chars as end-of-string -> filter out }
  640. hs1:=WideStringToUCS4StringNoNulls(s1);
  641. hs2:=WideStringToUCS4StringNoNulls(s2);
  642. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  643. end;
  644. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  645. begin
  646. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  647. end;
  648. function CharLengthPChar(const Str: PChar): PtrInt;
  649. var
  650. nextlen: ptrint;
  651. s: pchar;
  652. {$ifndef beos}
  653. mbstate: mbstate_t;
  654. {$endif not beos}
  655. begin
  656. result:=0;
  657. s:=str;
  658. {$ifndef beos}
  659. fillchar(mbstate,sizeof(mbstate),0);
  660. {$endif not beos}
  661. repeat
  662. {$ifdef beos}
  663. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  664. {$else beos}
  665. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  666. {$endif beos}
  667. { skip invalid/incomplete sequences }
  668. if (nextlen<0) then
  669. nextlen:=1;
  670. inc(result,nextlen);
  671. inc(s,nextlen);
  672. until (nextlen=0);
  673. end;
  674. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  675. var
  676. nextlen: ptrint;
  677. {$ifndef beos}
  678. mbstate: mbstate_t;
  679. {$endif not beos}
  680. begin
  681. {$ifdef beos}
  682. result:=ptrint(mblen(str,maxlookahead));
  683. {$else beos}
  684. fillchar(mbstate,sizeof(mbstate),0);
  685. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  686. { mbrlen can also return -2 for "incomplete but potially valid character
  687. and data has been processed" }
  688. if result<0 then
  689. result:=-1;
  690. {$endif beos}
  691. end;
  692. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  693. var
  694. a,b: pchar;
  695. i: PtrInt;
  696. begin
  697. if not(canmodifys1) then
  698. getmem(a,len1+1)
  699. else
  700. a:=s1;
  701. for i:=0 to len1-1 do
  702. if s1[i]<>#0 then
  703. a[i]:=s1[i]
  704. else
  705. a[i]:=#32;
  706. a[len1]:=#0;
  707. if not(canmodifys2) then
  708. getmem(b,len2+1)
  709. else
  710. b:=s2;
  711. for i:=0 to len2-1 do
  712. if s2[i]<>#0 then
  713. b[i]:=s2[i]
  714. else
  715. b[i]:=#32;
  716. b[len2]:=#0;
  717. result:=strcoll(a,b);
  718. if not(canmodifys1) then
  719. freemem(a);
  720. if not(canmodifys2) then
  721. freemem(b);
  722. end;
  723. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  724. begin
  725. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  726. end;
  727. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  728. begin
  729. result:=strcoll(s1,s2);
  730. end;
  731. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  732. var
  733. a, b: AnsiString;
  734. begin
  735. a:=UpperAnsistring(s1);
  736. b:=UpperAnsistring(s2);
  737. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  738. end;
  739. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  740. begin
  741. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  742. end;
  743. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  744. var
  745. a, b: pchar;
  746. begin
  747. if (maxlen=0) then
  748. exit(0);
  749. if (s1[maxlen]<>#0) then
  750. begin
  751. getmem(a,maxlen+1);
  752. move(s1^,a^,maxlen);
  753. a[maxlen]:=#0;
  754. end
  755. else
  756. a:=s1;
  757. if (s2[maxlen]<>#0) then
  758. begin
  759. getmem(b,maxlen+1);
  760. move(s2^,b^,maxlen);
  761. b[maxlen]:=#0;
  762. end
  763. else
  764. b:=s2;
  765. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  766. if (a<>s1) then
  767. freemem(a);
  768. if (b<>s2) then
  769. freemem(b);
  770. end;
  771. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  772. var
  773. a, b: ansistring;
  774. begin
  775. if (maxlen=0) then
  776. exit(0);
  777. setlength(a,maxlen);
  778. move(s1^,a[1],maxlen);
  779. setlength(b,maxlen);
  780. move(s2^,b[1],maxlen);
  781. result:=AnsiCompareText(a,b);
  782. end;
  783. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  784. var
  785. newlen: sizeint;
  786. begin
  787. newlen:=length(s);
  788. if newlen>strlen(orgp) then
  789. fpc_rangeerror;
  790. p:=orgp;
  791. if (newlen>0) then
  792. move(s[1],p[0],newlen);
  793. p[newlen]:=#0;
  794. end;
  795. function AnsiStrLower(Str: PChar): PChar;
  796. var
  797. temp: ansistring;
  798. begin
  799. temp:=loweransistring(str);
  800. ansi2pchar(temp,str,result);
  801. end;
  802. function AnsiStrUpper(Str: PChar): PChar;
  803. var
  804. temp: ansistring;
  805. begin
  806. temp:=upperansistring(str);
  807. ansi2pchar(temp,str,result);
  808. end;
  809. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  810. var
  811. langinfo: pchar;
  812. begin
  813. langinfo:=nl_langinfo(CODESET);
  814. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  815. that causes it to return an empty string of UTF-8 locales
  816. -> patch up (and in general, UTF-8 is a good default on
  817. Unix platforms) }
  818. if not assigned(langinfo) or
  819. (langinfo^=#0) then
  820. langinfo:='UTF-8';
  821. Result := iconv2win(ansistring(langinfo));
  822. end;
  823. {$ifdef FPC_HAS_CPSTRING}
  824. {$i textrec.inc}
  825. procedure SetStdIOCodePage(var T: Text); inline;
  826. begin
  827. case TextRec(T).Mode of
  828. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  829. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  830. end;
  831. end;
  832. procedure SetStdIOCodePages; inline;
  833. begin
  834. SetStdIOCodePage(Input);
  835. SetStdIOCodePage(Output);
  836. SetStdIOCodePage(ErrOutput);
  837. SetStdIOCodePage(StdOut);
  838. SetStdIOCodePage(StdErr);
  839. end;
  840. {$endif FPC_HAS_CPSTRING}
  841. Procedure SetCWideStringManager;
  842. Var
  843. CWideStringManager : TUnicodeStringManager;
  844. begin
  845. CWideStringManager:=widestringmanager;
  846. With CWideStringManager do
  847. begin
  848. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  849. Ansi2WideMoveProc:=@Ansi2WideMove;
  850. UpperWideStringProc:=@UpperWideString;
  851. LowerWideStringProc:=@LowerWideString;
  852. CompareWideStringProc:=@CompareWideString;
  853. CompareTextWideStringProc:=@CompareTextWideString;
  854. CharLengthPCharProc:=@CharLengthPChar;
  855. CodePointLengthProc:=@CodePointLength;
  856. UpperAnsiStringProc:=@UpperAnsiString;
  857. LowerAnsiStringProc:=@LowerAnsiString;
  858. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  859. CompareTextAnsiStringProc:=@AnsiCompareText;
  860. StrCompAnsiStringProc:=@StrCompAnsi;
  861. StrICompAnsiStringProc:=@AnsiStrIComp;
  862. StrLCompAnsiStringProc:=@AnsiStrLComp;
  863. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  864. StrLowerAnsiStringProc:=@AnsiStrLower;
  865. StrUpperAnsiStringProc:=@AnsiStrUpper;
  866. ThreadInitProc:=@InitThread;
  867. ThreadFiniProc:=@FiniThread;
  868. { Unicode }
  869. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  870. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  871. UpperUnicodeStringProc:=@UpperWideString;
  872. LowerUnicodeStringProc:=@LowerWideString;
  873. CompareUnicodeStringProc:=@CompareWideString;
  874. CompareTextUnicodeStringProc:=@CompareTextWideString;
  875. { CodePage }
  876. GetStandardCodePageProc:=@GetStandardCodePage;
  877. end;
  878. SetUnicodeStringManager(CWideStringManager);
  879. end;
  880. var
  881. iconvlib:TLibHandle;
  882. initialization
  883. SetCWideStringManager;
  884. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  885. { with the information from the environment variables according to POSIX }
  886. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  887. setlocale(LC_ALL,'');
  888. { load iconvctl function }
  889. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  890. if iconvlib<>0 then
  891. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  892. { set the DefaultSystemCodePage }
  893. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  894. {$ifdef FPC_HAS_CPSTRING}
  895. SetStdIOCodePages;
  896. {$endif FPC_HAS_CPSTRING}
  897. { init conversion tables for main program }
  898. InitThread;
  899. finalization
  900. { fini conversion tables for main program }
  901. FiniThread;
  902. { unload iconv library }
  903. if iconvlib<>0 then
  904. FreeLibrary(iconvlib);
  905. end.