cwstring.pp 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifdef TEST_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifndef iconv_is_in_libc}
  30. {$if defined(haiku)}
  31. {$linklib textencoding}
  32. {$else}
  33. {$linklib iconv}
  34. {$endif}
  35. {$define useiconv}
  36. {$endif not iconv_is_in_libc}
  37. {$i rtldefs.inc}
  38. Uses
  39. BaseUnix,
  40. ctypes,
  41. unix,
  42. unixtype,
  43. initc,
  44. dynlibs,
  45. unixcp;
  46. Const
  47. {$ifndef useiconv}
  48. libiconvname='c'; // is in libc under Linux.
  49. libprefix='lib';
  50. {$else}
  51. {$ifdef haiku}
  52. libiconvname='textencoding'; // is in libtextencoding under Haiku
  53. libprefix='lib';
  54. {$else}
  55. {$if defined(darwin) or defined(aix)}
  56. libiconvname='libiconv';
  57. libprefix='';
  58. {$else}
  59. libiconvname='iconv';
  60. libprefix='lib';
  61. {$endif}
  62. {$endif}
  63. {$endif}
  64. { helper functions from libc }
  65. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  66. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  67. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  68. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  69. {$ifdef netbsd}
  70. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  71. that should be used }
  72. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale_mb_len_max_32';
  73. {$else}
  74. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  75. {$endif}
  76. {$ifndef beos}
  77. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  78. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  79. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  80. {$else beos}
  81. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  82. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  83. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  84. {$endif beos}
  85. const
  86. {$if defined(linux)}
  87. __LC_CTYPE = 0;
  88. LC_ALL = 6;
  89. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  90. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  91. CODESET = _NL_CTYPE_CODESET_NAME;
  92. {$elseif defined(darwin)}
  93. CODESET = 0;
  94. LC_ALL = 0;
  95. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  96. __LC_CTYPE = 0;
  97. LC_ALL = 0;
  98. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  99. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  100. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  101. {$elseif defined(solaris)}
  102. {$define ACCEPT_646}
  103. CODESET=49;
  104. LC_ALL = 6;
  105. {$elseif defined(beos)}
  106. {$warning check correct value for BeOS}
  107. CODESET=49;
  108. {$ifdef haiku}
  109. LC_ALL = 0; // Checked for Haiku
  110. {$else}
  111. LC_ALL = 6; // Checked for BeOS
  112. {$endif}
  113. ESysEILSEQ = EILSEQ;
  114. {$elseif defined(OpenBSD)}
  115. {$define ACCEPT_646}
  116. CODESET = 51;
  117. LC_ALL = 0;
  118. {$elseif defined(NetBSD)}
  119. {$define ACCEPT_646}
  120. CODESET = 51;
  121. LC_ALL = 0;
  122. {$elseif defined(aix)}
  123. CODESET = 49;
  124. LC_ALL = -1;
  125. {$elseif defined(dragonfly)}
  126. CODESET = 0;
  127. LC_ALL = 0;
  128. __LC_CTYPE = 0;
  129. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  130. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  131. {$else not aix}
  132. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  133. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  134. {$endif}
  135. { unicode encoding name }
  136. {$ifdef FPC_LITTLE_ENDIAN}
  137. unicode_encoding2 = 'UTF-16LE';
  138. unicode_encoding4 = 'UCS-4LE';
  139. {$else FPC_LITTLE_ENDIAN}
  140. {$ifdef AIX}
  141. unicode_encoding2 = 'UTF-16';
  142. {$else AIX}
  143. unicode_encoding2 = 'UTF-16BE';
  144. unicode_encoding4 = 'UCS-4BE';
  145. {$endif AIX}
  146. {$endif FPC_LITTLE_ENDIAN}
  147. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  148. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  149. { that's a libc macro mapped to internal functions/variables }
  150. { and thus not a stable external API on systems where libc }
  151. { breaks backwards compatibility every now and then }
  152. MB_CUR_MAX = 10;
  153. { Requests for iconvctl }
  154. ICONV_TRIVIALP = 0; // int *argument
  155. ICONV_GET_TRANSLITERATE = 1; // int *argument
  156. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  157. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  158. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  159. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  160. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  161. type
  162. piconv_t = ^iconv_t;
  163. iconv_t = pointer;
  164. nl_item = cint;
  165. {$ifdef haiku}
  166. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  167. {$else}
  168. {$ifndef beos}
  169. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  170. {$endif}
  171. {$endif}
  172. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  173. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  174. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  175. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  176. const
  177. iconvctlname='iconvctl';
  178. {$else}
  179. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  180. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  181. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  182. const
  183. iconvctlname='libiconvctl';
  184. {$endif}
  185. var
  186. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  187. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  188. threadvar
  189. iconv_ansi2wide,
  190. iconv_wide2ansi : iconv_t;
  191. { since we cache the iconv_t converters, we have to do the same
  192. for the DefaultSystemCodePage variable since if it changes, we
  193. have to re-initialize the converters too. We can't do that via
  194. a callback in the widestring manager because DefaultSystemCodePage
  195. is not a threadvar and we can't automatically change this in all
  196. threads }
  197. current_DefaultSystemCodePage: TSystemCodePage;
  198. procedure InitThread;
  199. var
  200. transliterate: cint;
  201. iconvindex: longint;
  202. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  203. iconvname: rawbytestring;
  204. {$endif}
  205. begin
  206. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  207. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  208. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  209. if iconvindex<>-1 then
  210. iconvname:=UnixCpMap[iconvindex].name
  211. else
  212. { default to UTF-8 on Unix platforms }
  213. iconvname:='UTF-8';
  214. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  215. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  216. {$else}
  217. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  218. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  219. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  220. {$endif}
  221. if assigned(iconvctl) and
  222. (iconv_wide2ansi<>iconv_t(-1)) then
  223. begin
  224. transliterate:=1;
  225. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  226. end;
  227. end;
  228. procedure FiniThread;
  229. begin
  230. if (iconv_wide2ansi <> iconv_t(-1)) then
  231. iconv_close(iconv_wide2ansi);
  232. if (iconv_ansi2wide <> iconv_t(-1)) then
  233. iconv_close(iconv_ansi2wide);
  234. end;
  235. {$if defined(beos) and not defined(haiku)}
  236. function nl_langinfo(__item:nl_item):pchar;
  237. begin
  238. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  239. // Now implement the minimum required to correctly initialize WideString support
  240. case __item of
  241. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  242. else
  243. begin
  244. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  245. Result := '';
  246. end;
  247. end;
  248. end;
  249. {$endif}
  250. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  251. var
  252. iconvindex: longint;
  253. begin
  254. { TODO: add caching (then we also don't need separate code for
  255. the default system page and other ones)
  256. -- typecasting an ansistring function result to pchar is
  257. unsafe normally, but these are constant strings -> no
  258. problem }
  259. open_iconv_for_cps:=iconv_t(-1);
  260. iconvindex:=GetCodepageData(cp);
  261. if iconvindex=-1 then
  262. exit;
  263. repeat
  264. if cp_is_from then
  265. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  266. else
  267. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  268. inc(iconvindex);
  269. until (open_iconv_for_cps<>iconv_t(-1)) or
  270. (iconvindex>high(UnixCpMap)) or
  271. (UnixCpMap[iconvindex].cp<>cp);
  272. end;
  273. {$ifdef aix}
  274. {$i cwstraix.inc}
  275. {$endif aix}
  276. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  277. var
  278. outlength,
  279. outoffset,
  280. srclen,
  281. outleft : size_t;
  282. use_iconv: iconv_t;
  283. srcpos : pwidechar;
  284. destpos: pchar;
  285. mynil : pchar;
  286. my0 : size_t;
  287. err : longint;
  288. transliterate: cint;
  289. free_iconv: boolean;
  290. {$ifdef aix}
  291. intermediate: rawbytestring;
  292. {$endif aix}
  293. begin
  294. {$ifdef aix}
  295. { AIX libiconv does not support converting cp866 to anything else except
  296. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  297. if cp=866 then
  298. begin
  299. Wide2AnsiMove(source,intermediate,28595,len);
  300. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  301. exit;
  302. end;
  303. {$endif aix}
  304. if (cp=DefaultSystemCodePage) then
  305. begin
  306. { update iconv converter in case the DefaultSystemCodePage has been
  307. changed }
  308. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  309. begin
  310. FiniThread;
  311. InitThread;
  312. end;
  313. use_iconv:=iconv_wide2ansi;
  314. free_iconv:=false;
  315. end
  316. else
  317. begin
  318. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  319. if (use_iconv<>iconv_t(-1)) and
  320. assigned(iconvctl) then
  321. begin
  322. transliterate:=1;
  323. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  324. end;
  325. free_iconv:=true;
  326. end;
  327. { unsupported encoding -> default move }
  328. if use_iconv=iconv_t(-1) then
  329. begin
  330. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  331. exit;
  332. end;
  333. mynil:=nil;
  334. my0:=0;
  335. { rought estimation }
  336. setlength(dest,len*3);
  337. outlength:=len*3;
  338. srclen:=len*2;
  339. srcpos:=source;
  340. destpos:=pchar(dest);
  341. outleft:=outlength;
  342. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  343. begin
  344. err:=fpgetCerrno;
  345. case err of
  346. { last character is incomplete sequence }
  347. ESysEINVAL,
  348. { incomplete sequence in the middle }
  349. ESysEILSEQ:
  350. begin
  351. { skip and set to '?' }
  352. inc(srcpos);
  353. dec(srclen,2);
  354. destpos^:='?';
  355. inc(destpos);
  356. dec(outleft);
  357. { reset }
  358. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  359. if err=ESysEINVAL then
  360. break;
  361. end;
  362. ESysE2BIG:
  363. begin
  364. outoffset:=destpos-pchar(dest);
  365. { extend }
  366. setlength(dest,outlength+len*3);
  367. inc(outleft,len*3);
  368. inc(outlength,len*3);
  369. { string could have been moved }
  370. destpos:=pchar(dest)+outoffset;
  371. end;
  372. else
  373. runerror(231);
  374. end;
  375. end;
  376. // truncate string
  377. setlength(dest,length(dest)-outleft);
  378. SetCodePage(dest,cp,false);
  379. if free_iconv then
  380. iconv_close(use_iconv);
  381. end;
  382. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  383. var
  384. outlength,
  385. outoffset,
  386. outleft : size_t;
  387. use_iconv: iconv_t;
  388. srcpos,
  389. destpos: pchar;
  390. mynil : pchar;
  391. my0 : size_t;
  392. err: cint;
  393. iconvindex: longint;
  394. free_iconv: boolean;
  395. {$ifdef aix}
  396. intermediate: rawbytestring;
  397. {$endif aix}
  398. begin
  399. {$ifdef aix}
  400. { AIX libiconv does not support converting cp866 to anything else except
  401. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  402. if cp=866 then
  403. begin
  404. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  405. source:=pchar(intermediate);
  406. end;
  407. {$endif aix}
  408. if (cp=DefaultSystemCodePage) then
  409. begin
  410. { update iconv converter in case the DefaultSystemCodePage has been
  411. changed }
  412. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  413. begin
  414. FiniThread;
  415. InitThread;
  416. end;
  417. use_iconv:=iconv_ansi2wide;
  418. free_iconv:=false;
  419. end
  420. else
  421. begin
  422. { TODO: add caching (then we also don't need separate code for
  423. the default system page and other ones)
  424. -- typecasting an ansistring function result to pchar is
  425. unsafe normally, but these are constant strings -> no
  426. problem }
  427. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  428. free_iconv:=true;
  429. end;
  430. { unsupported encoding -> default move }
  431. if use_iconv=iconv_t(-1) then
  432. begin
  433. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  434. exit;
  435. end;
  436. mynil:=nil;
  437. my0:=0;
  438. // extra space
  439. outlength:=len+1;
  440. setlength(dest,outlength);
  441. srcpos:=source;
  442. destpos:=pchar(dest);
  443. outleft:=outlength*2;
  444. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  445. begin
  446. err:=fpgetCerrno;
  447. case err of
  448. ESysEINVAL,
  449. ESysEILSEQ:
  450. begin
  451. { skip and set to '?' }
  452. inc(srcpos);
  453. dec(len);
  454. pwidechar(destpos)^:='?';
  455. inc(destpos,2);
  456. dec(outleft,2);
  457. { reset }
  458. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  459. if err=ESysEINVAL then
  460. break;
  461. end;
  462. ESysE2BIG:
  463. begin
  464. outoffset:=destpos-pchar(dest);
  465. { extend }
  466. setlength(dest,outlength+len);
  467. inc(outleft,len*2);
  468. inc(outlength,len);
  469. { string could have been moved }
  470. destpos:=pchar(dest)+outoffset;
  471. end;
  472. else
  473. runerror(231);
  474. end;
  475. end;
  476. // truncate string
  477. setlength(dest,length(dest)-outleft div 2);
  478. if free_iconv then
  479. iconv_close(use_iconv);
  480. end;
  481. function LowerWideString(const s : WideString) : WideString;
  482. var
  483. i : SizeInt;
  484. begin
  485. SetLength(result,length(s));
  486. for i:=0 to length(s)-1 do
  487. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  488. end;
  489. function UpperWideString(const s : WideString) : WideString;
  490. var
  491. i : SizeInt;
  492. begin
  493. SetLength(result,length(s));
  494. for i:=0 to length(s)-1 do
  495. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  496. end;
  497. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  498. begin
  499. if (len>length(s)) then
  500. if (length(s) < 10*256) then
  501. setlength(s,length(s)+10)
  502. else
  503. setlength(s,length(s)+length(s) shr 8);
  504. end;
  505. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  506. begin
  507. EnsureAnsiLen(s,index);
  508. pchar(@s[index])^:=c;
  509. inc(index);
  510. end;
  511. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  512. {$ifndef beos}
  513. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  514. {$else not beos}
  515. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  516. {$endif beos}
  517. var
  518. p : pchar;
  519. mblen : size_t;
  520. begin
  521. { we know that s is unique -> avoid uniquestring calls}
  522. p:=@s[index];
  523. if (nc<=127) then
  524. ConcatCharToAnsiStr(char(nc),s,index)
  525. else
  526. begin
  527. EnsureAnsiLen(s,index+MB_CUR_MAX);
  528. {$ifndef beos}
  529. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  530. {$else not beos}
  531. mblen:=wctomb(p,wchar_t(nc));
  532. {$endif not beos}
  533. if (mblen<>size_t(-1)) then
  534. inc(index,mblen)
  535. else
  536. begin
  537. { invalid wide char }
  538. p^:='?';
  539. inc(index);
  540. end;
  541. end;
  542. end;
  543. function LowerAnsiString(const s : AnsiString) : AnsiString;
  544. var
  545. i, slen,
  546. resindex : SizeInt;
  547. mblen : size_t;
  548. {$ifndef beos}
  549. ombstate,
  550. nmbstate : mbstate_t;
  551. {$endif beos}
  552. wc : wchar_t;
  553. begin
  554. {$ifndef beos}
  555. fillchar(ombstate,sizeof(ombstate),0);
  556. fillchar(nmbstate,sizeof(nmbstate),0);
  557. {$endif beos}
  558. slen:=length(s);
  559. SetLength(result,slen+10);
  560. i:=1;
  561. resindex:=1;
  562. while (i<=slen) do
  563. begin
  564. if (s[i]<=#127) then
  565. begin
  566. wc:=wchar_t(s[i]);
  567. mblen:= 1;
  568. end
  569. else
  570. {$ifndef beos}
  571. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  572. {$else not beos}
  573. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  574. {$endif not beos}
  575. case mblen of
  576. size_t(-2):
  577. begin
  578. { partial invalid character, copy literally }
  579. while (i<=slen) do
  580. begin
  581. ConcatCharToAnsiStr(s[i],result,resindex);
  582. inc(i);
  583. end;
  584. end;
  585. size_t(-1), 0:
  586. begin
  587. { invalid or null character }
  588. ConcatCharToAnsiStr(s[i],result,resindex);
  589. inc(i);
  590. end;
  591. else
  592. begin
  593. { a valid sequence }
  594. { even if mblen = 1, the lowercase version may have a }
  595. { different length }
  596. { We can't do anything special if wchar_t is 16 bit... }
  597. {$ifndef beos}
  598. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  599. {$else not beos}
  600. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  601. {$endif not beos}
  602. inc(i,mblen);
  603. end;
  604. end;
  605. end;
  606. SetLength(result,resindex-1);
  607. end;
  608. function UpperAnsiString(const s : AnsiString) : AnsiString;
  609. var
  610. i, slen,
  611. resindex : SizeInt;
  612. mblen : size_t;
  613. {$ifndef beos}
  614. ombstate,
  615. nmbstate : mbstate_t;
  616. {$endif beos}
  617. wc : wchar_t;
  618. begin
  619. {$ifndef beos}
  620. fillchar(ombstate,sizeof(ombstate),0);
  621. fillchar(nmbstate,sizeof(nmbstate),0);
  622. {$endif beos}
  623. slen:=length(s);
  624. SetLength(result,slen+10);
  625. i:=1;
  626. resindex:=1;
  627. while (i<=slen) do
  628. begin
  629. if (s[i]<=#127) then
  630. begin
  631. wc:=wchar_t(s[i]);
  632. mblen:= 1;
  633. end
  634. else
  635. {$ifndef beos}
  636. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  637. {$else not beos}
  638. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  639. {$endif beos}
  640. case mblen of
  641. size_t(-2):
  642. begin
  643. { partial invalid character, copy literally }
  644. while (i<=slen) do
  645. begin
  646. ConcatCharToAnsiStr(s[i],result,resindex);
  647. inc(i);
  648. end;
  649. end;
  650. size_t(-1), 0:
  651. begin
  652. { invalid or null character }
  653. ConcatCharToAnsiStr(s[i],result,resindex);
  654. inc(i);
  655. end;
  656. else
  657. begin
  658. { a valid sequence }
  659. { even if mblen = 1, the uppercase version may have a }
  660. { different length }
  661. { We can't do anything special if wchar_t is 16 bit... }
  662. {$ifndef beos}
  663. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  664. {$else not beos}
  665. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  666. {$endif not beos}
  667. inc(i,mblen);
  668. end;
  669. end;
  670. end;
  671. SetLength(result,resindex-1);
  672. end;
  673. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  674. var
  675. i, slen,
  676. destindex : SizeInt;
  677. uch : UCS4Char;
  678. begin
  679. slen:=length(s);
  680. setlength(result,slen+1);
  681. i:=1;
  682. destindex:=0;
  683. while (i<=slen) do
  684. begin
  685. uch:=UCS4Char(s[i]);
  686. if (uch=0) then
  687. result[destindex]:=32
  688. else if (uch<=$d7ff) or (uch>=$e000) then
  689. result[destindex]:=uch
  690. else if (uch<=$dbff) and
  691. (i<slen) and
  692. (s[i+1]>=#$dc00) and
  693. (s[i+1]<=#$dfff) then
  694. begin
  695. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  696. inc(i);
  697. end
  698. else { invalid surrogate pair }
  699. result[destindex]:=uch;
  700. inc(i);
  701. inc(destindex);
  702. end;
  703. result[destindex]:=UCS4Char(0);
  704. { Trimming length in this particular case is just a waste of time,
  705. because result will be interpreted as null-terminated and discarded
  706. almost immediately }
  707. end;
  708. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  709. {$if not(defined (aix) and defined(cpupowerpc32))}
  710. var
  711. hs1,hs2 : UCS4String;
  712. begin
  713. { wcscoll interprets null chars as end-of-string -> filter out }
  714. hs1:=WideStringToUCS4StringNoNulls(s1);
  715. hs2:=WideStringToUCS4StringNoNulls(s2);
  716. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  717. end;
  718. {$else}
  719. { AIX/PPC32 has a 16 bit wchar_t }
  720. var
  721. i, len: longint;
  722. hs1, hs2: array of widechar;
  723. begin
  724. len:=length(s1);
  725. setlength(hs1,len+1);
  726. for i:=1 to len do
  727. if s1[i]<>#0 then
  728. hs1[i-1]:=s1[i]
  729. else
  730. hs1[i-1]:=#32;
  731. hs1[len]:=#0;
  732. len:=length(s2);
  733. setlength(hs2,len+1);
  734. for i:=1 to len do
  735. if s2[i]<>#0 then
  736. hs2[i-1]:=s2[i]
  737. else
  738. hs2[i-1]:=#32;
  739. hs2[len]:=#0;
  740. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  741. end;
  742. {$endif}
  743. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  744. begin
  745. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  746. end;
  747. { return value: number of code points in the string. Whenever an invalid
  748. code point is encountered, all characters part of this invalid code point
  749. are considered to form one "character" and the next character is
  750. considered to be the start of a new (possibly also invalid) code point }
  751. function CharLengthPChar(const Str: PChar): PtrInt;
  752. var
  753. nextlen: ptrint;
  754. s: pchar;
  755. {$ifndef beos}
  756. mbstate: mbstate_t;
  757. {$endif not beos}
  758. begin
  759. result:=0;
  760. s:=str;
  761. {$ifndef beos}
  762. fillchar(mbstate,sizeof(mbstate),0);
  763. {$endif not beos}
  764. repeat
  765. {$ifdef beos}
  766. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  767. {$else beos}
  768. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  769. {$endif beos}
  770. { skip invalid/incomplete sequences }
  771. if (nextlen<0) then
  772. nextlen:=1;
  773. inc(result,1);
  774. inc(s,nextlen);
  775. until (nextlen=0);
  776. end;
  777. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  778. {$ifndef beos}
  779. var
  780. mbstate: mbstate_t;
  781. {$endif not beos}
  782. begin
  783. {$ifdef beos}
  784. result:=ptrint(mblen(str,maxlookahead));
  785. {$else beos}
  786. fillchar(mbstate,sizeof(mbstate),0);
  787. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  788. { mbrlen can also return -2 for "incomplete but potially valid character
  789. and data has been processed" }
  790. if result<0 then
  791. result:=-1;
  792. {$endif beos}
  793. end;
  794. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  795. var
  796. a,b: pchar;
  797. i: PtrInt;
  798. begin
  799. if not(canmodifys1) then
  800. getmem(a,len1+1)
  801. else
  802. a:=s1;
  803. for i:=0 to len1-1 do
  804. if s1[i]<>#0 then
  805. a[i]:=s1[i]
  806. else
  807. a[i]:=#32;
  808. a[len1]:=#0;
  809. if not(canmodifys2) then
  810. getmem(b,len2+1)
  811. else
  812. b:=s2;
  813. for i:=0 to len2-1 do
  814. if s2[i]<>#0 then
  815. b[i]:=s2[i]
  816. else
  817. b[i]:=#32;
  818. b[len2]:=#0;
  819. result:=strcoll(a,b);
  820. if not(canmodifys1) then
  821. freemem(a);
  822. if not(canmodifys2) then
  823. freemem(b);
  824. end;
  825. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  826. begin
  827. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  828. end;
  829. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  830. begin
  831. result:=strcoll(s1,s2);
  832. end;
  833. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  834. var
  835. a, b: AnsiString;
  836. begin
  837. a:=UpperAnsistring(s1);
  838. b:=UpperAnsistring(s2);
  839. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  840. end;
  841. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  842. begin
  843. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  844. end;
  845. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  846. var
  847. a, b: pchar;
  848. begin
  849. if (maxlen=0) then
  850. exit(0);
  851. if (s1[maxlen]<>#0) then
  852. begin
  853. getmem(a,maxlen+1);
  854. move(s1^,a^,maxlen);
  855. a[maxlen]:=#0;
  856. end
  857. else
  858. a:=s1;
  859. if (s2[maxlen]<>#0) then
  860. begin
  861. getmem(b,maxlen+1);
  862. move(s2^,b^,maxlen);
  863. b[maxlen]:=#0;
  864. end
  865. else
  866. b:=s2;
  867. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  868. if (a<>s1) then
  869. freemem(a);
  870. if (b<>s2) then
  871. freemem(b);
  872. end;
  873. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  874. var
  875. a, b: ansistring;
  876. begin
  877. if (maxlen=0) then
  878. exit(0);
  879. setlength(a,maxlen);
  880. move(s1^,a[1],maxlen);
  881. setlength(b,maxlen);
  882. move(s2^,b[1],maxlen);
  883. result:=AnsiCompareText(a,b);
  884. end;
  885. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  886. var
  887. newlen: sizeint;
  888. begin
  889. newlen:=length(s);
  890. if newlen>strlen(orgp) then
  891. fpc_rangeerror;
  892. p:=orgp;
  893. if (newlen>0) then
  894. move(s[1],p[0],newlen);
  895. p[newlen]:=#0;
  896. end;
  897. function AnsiStrLower(Str: PChar): PChar;
  898. var
  899. temp: ansistring;
  900. begin
  901. temp:=loweransistring(str);
  902. ansi2pchar(temp,str,result);
  903. end;
  904. function AnsiStrUpper(Str: PChar): PChar;
  905. var
  906. temp: ansistring;
  907. begin
  908. temp:=upperansistring(str);
  909. ansi2pchar(temp,str,result);
  910. end;
  911. function envvarset(const varname: pchar): boolean;
  912. var
  913. varval: pchar;
  914. begin
  915. varval:=fpgetenv(varname);
  916. result:=
  917. assigned(varval) and
  918. (varval[0]<>#0);
  919. end;
  920. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  921. var
  922. langinfo: pchar;
  923. begin
  924. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  925. if stdcp=scpFileSystemSingleByte then
  926. begin
  927. result:=CP_UTF8;
  928. exit;
  929. end;
  930. {$endif}
  931. { if none of the relevant LC_* environment variables are set, fall back to
  932. UTF-8 (this happens under some versions of OS X for GUI applications, which
  933. otherwise get CP_ASCII) }
  934. if envvarset('LC_ALL') or
  935. envvarset('LC_CTYPE') or
  936. envvarset('LANG') then
  937. begin
  938. langinfo:=nl_langinfo(CODESET);
  939. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  940. that causes it to return an empty string of UTF-8 locales
  941. -> patch up (and in general, UTF-8 is a good default on
  942. Unix platforms) }
  943. if not assigned(langinfo) or
  944. (langinfo^=#0) then
  945. langinfo:='UTF-8';
  946. Result:=GetCodepageByName(ansistring(langinfo));
  947. end
  948. else
  949. Result:=unixcp.GetSystemCodepage;
  950. end;
  951. {$ifdef FPC_HAS_CPSTRING}
  952. procedure SetStdIOCodePage(var T: Text); inline;
  953. begin
  954. case TextRec(T).Mode of
  955. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  956. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  957. end;
  958. end;
  959. procedure SetStdIOCodePages; inline;
  960. begin
  961. SetStdIOCodePage(Input);
  962. SetStdIOCodePage(Output);
  963. SetStdIOCodePage(ErrOutput);
  964. SetStdIOCodePage(StdOut);
  965. SetStdIOCodePage(StdErr);
  966. end;
  967. {$endif FPC_HAS_CPSTRING}
  968. var
  969. OrgWideStringManager: TUnicodeStringManager;
  970. Procedure SetCWideStringManager;
  971. Var
  972. CWideStringManager : TUnicodeStringManager;
  973. begin
  974. GetUnicodeStringManager(OrgWideStringManager);
  975. CWideStringManager:=OrgWideStringManager;
  976. With CWideStringManager do
  977. begin
  978. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  979. Ansi2WideMoveProc:=@Ansi2WideMove;
  980. UpperWideStringProc:=@UpperWideString;
  981. LowerWideStringProc:=@LowerWideString;
  982. CompareWideStringProc:=@CompareWideString;
  983. CompareTextWideStringProc:=@CompareTextWideString;
  984. CharLengthPCharProc:=@CharLengthPChar;
  985. CodePointLengthProc:=@CodePointLength;
  986. UpperAnsiStringProc:=@UpperAnsiString;
  987. LowerAnsiStringProc:=@LowerAnsiString;
  988. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  989. CompareTextAnsiStringProc:=@AnsiCompareText;
  990. StrCompAnsiStringProc:=@StrCompAnsi;
  991. StrICompAnsiStringProc:=@AnsiStrIComp;
  992. StrLCompAnsiStringProc:=@AnsiStrLComp;
  993. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  994. StrLowerAnsiStringProc:=@AnsiStrLower;
  995. StrUpperAnsiStringProc:=@AnsiStrUpper;
  996. ThreadInitProc:=@InitThread;
  997. ThreadFiniProc:=@FiniThread;
  998. { Unicode }
  999. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1000. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1001. UpperUnicodeStringProc:=@UpperWideString;
  1002. LowerUnicodeStringProc:=@LowerWideString;
  1003. CompareUnicodeStringProc:=@CompareWideString;
  1004. CompareTextUnicodeStringProc:=@CompareTextWideString;
  1005. { CodePage }
  1006. GetStandardCodePageProc:=@GetStandardCodePage;
  1007. end;
  1008. SetUnicodeStringManager(CWideStringManager);
  1009. end;
  1010. var
  1011. iconvlib:TLibHandle;
  1012. initialization
  1013. SetCWideStringManager;
  1014. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1015. { with the information from the environment variables according to POSIX }
  1016. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1017. setlocale(LC_ALL,'');
  1018. { load iconvctl function }
  1019. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1020. if iconvlib<>0 then
  1021. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1022. { set the DefaultSystemCodePage }
  1023. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1024. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1025. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1026. {$ifdef FPC_HAS_CPSTRING}
  1027. SetStdIOCodePages;
  1028. {$endif FPC_HAS_CPSTRING}
  1029. { init conversion tables for main program }
  1030. InitThread;
  1031. finalization
  1032. { fini conversion tables for main program }
  1033. FiniThread;
  1034. { unload iconv library }
  1035. if iconvlib<>0 then
  1036. FreeLibrary(iconvlib);
  1037. { restore previous (probably default) widestring manager so that subsequent calls
  1038. into the widestring manager won't trigger the finalized functionality }
  1039. SetWideStringManager(OrgWideStringManager);
  1040. end.