cwstring.pp 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$modeswitch unicodestrings-}
  14. {$h-}
  15. {$inline on}
  16. {$implicitexceptions off}
  17. unit cwstring;
  18. interface
  19. procedure SetCWidestringManager;
  20. implementation
  21. {$linklib c}
  22. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  23. {$if defined(linux) or defined(solaris)}
  24. {$define iconv_is_in_libc}
  25. {$endif}
  26. {$ifdef netbsd}
  27. {$ifndef DISABLE_ICONV_LIBC}
  28. {$define iconv_is_in_libc}
  29. {$endif}
  30. {$endif}
  31. {$ifdef dragonfly}
  32. { iconv function are included inside libc for DragonFly version 6.3 }
  33. { See https://man.dragonflybsd.org/?command=iconv&section=3 }
  34. {$ifndef DISABLE_ICONV_LIBC}
  35. {$define iconv_is_in_libc}
  36. {$endif}
  37. {$endif}
  38. { Modern FreeBSD releases also claim to have iconv
  39. function inside libc, at least
  40. this is stated explicitly in iconv(3) man
  41. of FreeBSD from 10.0 release
  42. PM 2023-05-05 }
  43. {$ifdef freebsd}
  44. {$ifndef DISABLE_ICONV_LIBC}
  45. {$define iconv_is_in_libc}
  46. {$endif}
  47. {$endif}
  48. {$ifndef iconv_is_in_libc}
  49. {$if defined(haiku)}
  50. {$linklib textencoding}
  51. {$else}
  52. {$linklib iconv}
  53. {$endif}
  54. {$define useiconv}
  55. {$endif not iconv_is_in_libc}
  56. {$i rtldefs.inc}
  57. Uses
  58. BaseUnix,
  59. ctypes,
  60. unix,
  61. unixtype,
  62. initc,
  63. dynlibs,
  64. unixcp;
  65. Const
  66. {$ifndef useiconv}
  67. libiconvname='c'; // is in libc under Linux.
  68. libprefix='lib';
  69. {$else}
  70. {$ifdef haiku}
  71. libiconvname='textencoding'; // is in libtextencoding under Haiku
  72. libprefix='lib';
  73. {$else}
  74. {$if defined(darwin) or defined(aix)}
  75. libiconvname='libiconv';
  76. libprefix='';
  77. {$else}
  78. libiconvname='iconv';
  79. libprefix='lib';
  80. {$endif}
  81. {$endif}
  82. {$endif}
  83. { helper functions from libc }
  84. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  85. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  86. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  87. function strcoll (__s1:PAnsiChar; __s2:PAnsiChar):cint;cdecl;external clib name 'strcoll';
  88. {$ifdef netbsd}
  89. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  90. that should be used }
  91. function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name '__setlocale50';
  92. {$else}
  93. function setlocale(category: cint; locale: PAnsiChar): PAnsiChar; cdecl; external clib name 'setlocale';
  94. {$endif}
  95. {$if not(defined(beos) and not defined(haiku))}
  96. function mbrtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  97. function wcrtomb(s: PAnsiChar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  98. function mbrlen(const s: PAnsiChar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  99. {$else beos}
  100. function mbtowc(pwc: pwchar_t; const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  101. function wctomb(s: PAnsiChar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  102. function mblen(const s: PAnsiChar; n: size_t): size_t; cdecl; external clib name 'mblen';
  103. {$endif beos}
  104. const
  105. {$if defined(linux)}
  106. __LC_CTYPE = 0;
  107. LC_ALL = 6;
  108. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  109. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  110. CODESET = _NL_CTYPE_CODESET_NAME;
  111. {$elseif defined(darwin)}
  112. CODESET = 0;
  113. LC_ALL = 0;
  114. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  115. __LC_CTYPE = 0;
  116. LC_ALL = 0;
  117. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  118. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  119. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  120. {$elseif defined(solaris)}
  121. {$define ACCEPT_646}
  122. CODESET=49;
  123. LC_ALL = 6;
  124. {$elseif defined(beos)}
  125. {$ifdef haiku}
  126. CODESET= 0; // Checked for Haiku
  127. LC_ALL = 0; // Checked for Haiku
  128. {$else}
  129. {$warning check correct value for BeOS}
  130. CODESET=49;
  131. LC_ALL = 6; // Checked for BeOS
  132. {$endif}
  133. ESysEILSEQ = EILSEQ;
  134. {$elseif defined(OpenBSD)}
  135. {$define ACCEPT_646}
  136. CODESET = 51;
  137. LC_ALL = 0;
  138. {$elseif defined(NetBSD)}
  139. {$define ACCEPT_646}
  140. CODESET = 51;
  141. LC_ALL = 0;
  142. {$elseif defined(aix)}
  143. CODESET = 49;
  144. LC_ALL = -1;
  145. {$elseif defined(dragonfly)}
  146. CODESET = 0;
  147. LC_ALL = 0;
  148. __LC_CTYPE = 0;
  149. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  150. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  151. {$else not aix}
  152. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  153. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  154. {$endif}
  155. { unicode encoding name }
  156. {$ifdef FPC_LITTLE_ENDIAN}
  157. unicode_encoding2 = 'UTF-16LE';
  158. unicode_encoding4 = 'UCS-4LE';
  159. {$else FPC_LITTLE_ENDIAN}
  160. {$ifdef AIX}
  161. unicode_encoding2 = 'UTF-16';
  162. {$else AIX}
  163. unicode_encoding2 = 'UTF-16BE';
  164. unicode_encoding4 = 'UCS-4BE';
  165. {$endif AIX}
  166. {$endif FPC_LITTLE_ENDIAN}
  167. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  168. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  169. { that's a libc macro mapped to internal functions/variables }
  170. { and thus not a stable external API on systems where libc }
  171. { breaks backwards compatibility every now and then }
  172. MB_CUR_MAX = 10;
  173. { Requests for iconvctl }
  174. ICONV_TRIVIALP = 0; // int *argument
  175. ICONV_GET_TRANSLITERATE = 1; // int *argument
  176. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  177. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  178. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  179. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  180. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  181. type
  182. piconv_t = ^iconv_t;
  183. iconv_t = pointer;
  184. nl_item = cint;
  185. {$ifdef haiku}
  186. function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external 'root' name 'nl_langinfo';
  187. {$else}
  188. {$ifndef beos}
  189. function nl_langinfo(__item:nl_item):PAnsiChar;cdecl;external libiconvname name 'nl_langinfo';
  190. {$endif}
  191. {$endif}
  192. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  193. function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  194. function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  195. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  196. const
  197. iconvctlname='iconvctl';
  198. {$else}
  199. function iconv_open(__tocode:PAnsiChar; __fromcode:PAnsiChar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  200. function iconv(__cd:iconv_t; __inbuf:PPAnsiChar; __inbytesleft:psize_t; __outbuf:PPAnsiChar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  201. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  202. const
  203. iconvctlname='libiconvctl';
  204. {$endif}
  205. var
  206. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  207. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  208. Const
  209. UTF8Name : ShortString = 'UTF-8';
  210. TransLitname : ShortString ='//TRANSLIT';
  211. threadvar
  212. iconv_ansi2wide,
  213. iconv_wide2ansi : iconv_t;
  214. { since we cache the iconv_t converters, we have to do the same
  215. for the DefaultSystemCodePage variable since if it changes, we
  216. have to re-initialize the converters too. We can't do that via
  217. a callback in the widestring manager because DefaultSystemCodePage
  218. is not a threadvar and we can't automatically change this in all
  219. threads }
  220. current_DefaultSystemCodePage: TSystemCodePage;
  221. procedure InitThread;
  222. var
  223. transliterate: cint;
  224. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  225. iconvindex: longint;
  226. {$endif}
  227. iconvname, toencoding: shortstring;
  228. begin
  229. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  230. {$if declared(iconvindex)}
  231. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  232. if iconvindex<>-1 then
  233. iconvname:=UnixCpMap[iconvindex].name
  234. else
  235. { default to UTF-8 on Unix platforms }
  236. iconvname:=UTF8Name;
  237. {$else}
  238. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  239. iconvname:='UTF-8';
  240. {$endif}
  241. toencoding:=iconvname;
  242. if not assigned(iconvctl) then
  243. toencoding:=toencoding+TransLitName;
  244. iconv_wide2ansi:=iconv_open(PAnsiChar(@toencoding[1]),unicode_encoding2);
  245. iconv_ansi2wide:=iconv_open(unicode_encoding2,PAnsiChar(@iconvname[1]));
  246. if assigned(iconvctl) and
  247. (iconv_wide2ansi<>iconv_t(-1)) then
  248. begin
  249. transliterate:=1;
  250. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  251. end;
  252. end;
  253. procedure FiniThread;
  254. begin
  255. if (iconv_wide2ansi <> iconv_t(-1)) then
  256. iconv_close(iconv_wide2ansi);
  257. if (iconv_ansi2wide <> iconv_t(-1)) then
  258. iconv_close(iconv_ansi2wide);
  259. end;
  260. {$if defined(beos) and not defined(haiku)}
  261. function nl_langinfo(__item:nl_item):PAnsiChar;
  262. begin
  263. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  264. // Now implement the minimum required to correctly initialize WideString support
  265. case __item of
  266. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  267. else
  268. begin
  269. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  270. Result := '';
  271. end;
  272. end;
  273. end;
  274. {$endif}
  275. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: PAnsiChar; cp_is_from: boolean): iconv_t;
  276. var
  277. iconvindex: longint;
  278. toencoding: rawbytestring;
  279. transliterate: cint;
  280. begin
  281. { TODO: add caching (then we also don't need separate code for
  282. the default system page and other ones)
  283. -- typecasting an ansistring function result to PAnsiChar is
  284. unsafe normally, but these are constant strings -> no
  285. problem }
  286. open_iconv_for_cps:=iconv_t(-1);
  287. iconvindex:=GetCodepageData(cp);
  288. if iconvindex=-1 then
  289. exit;
  290. repeat
  291. if cp_is_from then
  292. open_iconv_for_cps:=iconv_open(otherencoding,PAnsiChar(UnixCpMap[iconvindex].name))
  293. else
  294. begin
  295. toencoding:=UnixCpMap[iconvindex].name;
  296. if not assigned(iconvctl) then
  297. toencoding:=toencoding+'//TRANSLIT';
  298. open_iconv_for_cps:=iconv_open(PAnsiChar(toencoding),otherencoding);
  299. end;
  300. inc(iconvindex);
  301. until (open_iconv_for_cps<>iconv_t(-1)) or
  302. (iconvindex>high(UnixCpMap)) or
  303. (UnixCpMap[iconvindex].cp<>cp);
  304. if not cp_is_from and
  305. (open_iconv_for_cps<>iconv_t(-1)) and
  306. assigned(iconvctl) then
  307. begin
  308. transliterate:=1;
  309. iconvctl(open_iconv_for_cps,ICONV_SET_TRANSLITERATE,@transliterate);
  310. end;
  311. end;
  312. {$ifdef aix}
  313. {$i cwstraix.inc}
  314. {$endif aix}
  315. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  316. var
  317. outlength,
  318. outoffset,
  319. srclen,
  320. outleft : size_t;
  321. use_iconv: iconv_t;
  322. srcpos : pwidechar;
  323. destpos: PAnsiChar;
  324. mynil : PAnsiChar;
  325. my0 : size_t;
  326. err : longint;
  327. transliterate: cint;
  328. free_iconv: boolean;
  329. {$ifdef aix}
  330. intermediate: rawbytestring;
  331. {$endif aix}
  332. begin
  333. {$ifdef aix}
  334. { AIX libiconv does not support converting cp866 to anything else except
  335. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  336. if cp=866 then
  337. begin
  338. Wide2AnsiMove(source,intermediate,28595,len);
  339. if handle_aix_intermediate(PAnsiChar(intermediate),28595,cp,dest,len) then
  340. exit;
  341. end;
  342. {$endif aix}
  343. if (cp=DefaultSystemCodePage) then
  344. begin
  345. { update iconv converter in case the DefaultSystemCodePage has been
  346. changed }
  347. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  348. begin
  349. FiniThread;
  350. InitThread;
  351. end;
  352. use_iconv:=iconv_wide2ansi;
  353. free_iconv:=false;
  354. end
  355. else
  356. begin
  357. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  358. if (use_iconv<>iconv_t(-1)) and
  359. assigned(iconvctl) then
  360. begin
  361. transliterate:=1;
  362. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  363. end;
  364. free_iconv:=true;
  365. end;
  366. { unsupported encoding -> default move }
  367. if use_iconv=iconv_t(-1) then
  368. begin
  369. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  370. exit;
  371. end;
  372. mynil:=nil;
  373. my0:=0;
  374. { rought estimation }
  375. setlength(dest,len*3);
  376. outlength:=len*3;
  377. srclen:=len*2;
  378. srcpos:=source;
  379. destpos:=PAnsiChar(dest);
  380. outleft:=outlength;
  381. while iconv(use_iconv,PPAnsiChar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  382. begin
  383. err:=fpgetCerrno;
  384. case err of
  385. { last character is incomplete sequence }
  386. ESysEINVAL,
  387. { incomplete sequence in the middle }
  388. ESysEILSEQ:
  389. begin
  390. { skip and set to '?' }
  391. inc(srcpos);
  392. dec(srclen,2);
  393. destpos^:='?';
  394. inc(destpos);
  395. dec(outleft);
  396. { reset }
  397. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  398. if err=ESysEINVAL then
  399. break;
  400. end;
  401. ESysE2BIG:
  402. begin
  403. outoffset:=destpos-PAnsiChar(dest);
  404. { extend }
  405. setlength(dest,outlength+len*3);
  406. inc(outleft,len*3);
  407. inc(outlength,len*3);
  408. { string could have been moved }
  409. destpos:=PAnsiChar(dest)+outoffset;
  410. end;
  411. else
  412. runerror(231);
  413. end;
  414. end;
  415. // truncate string
  416. setlength(dest,length(dest)-outleft);
  417. SetCodePage(dest,cp,false);
  418. if free_iconv then
  419. iconv_close(use_iconv);
  420. end;
  421. procedure Ansi2WideMove(source:PAnsiChar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  422. var
  423. outlength,
  424. outoffset,
  425. outleft : size_t;
  426. use_iconv: iconv_t;
  427. srcpos,
  428. destpos: PAnsiChar;
  429. mynil : PAnsiChar;
  430. my0 : size_t;
  431. err: cint;
  432. iconvindex: longint;
  433. free_iconv: boolean;
  434. {$ifdef aix}
  435. intermediate: rawbytestring;
  436. {$endif aix}
  437. begin
  438. {$ifdef aix}
  439. { AIX libiconv does not support converting cp866 to anything else except
  440. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  441. if cp=866 then
  442. begin
  443. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  444. source:=PAnsiChar(intermediate);
  445. end;
  446. {$endif aix}
  447. if (cp=DefaultSystemCodePage) then
  448. begin
  449. { update iconv converter in case the DefaultSystemCodePage has been
  450. changed }
  451. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  452. begin
  453. FiniThread;
  454. InitThread;
  455. end;
  456. use_iconv:=iconv_ansi2wide;
  457. free_iconv:=false;
  458. // Using Unicode RTL, we can end up here when iconv_ansi2wide is not yet initialized
  459. if (use_iconv=Nil) then
  460. begin
  461. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  462. exit;
  463. end;
  464. end
  465. else
  466. begin
  467. { TODO: add caching (then we also don't need separate code for
  468. the default system page and other ones)
  469. -- typecasting an ansistring function result to PAnsiChar is
  470. unsafe normally, but these are constant strings -> no
  471. problem }
  472. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  473. free_iconv:=true;
  474. end;
  475. { unsupported encoding -> default move }
  476. if use_iconv=iconv_t(-1) then
  477. begin
  478. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  479. exit;
  480. end;
  481. mynil:=nil;
  482. my0:=0;
  483. // extra space
  484. outlength:=len+1;
  485. setlength(dest,outlength);
  486. srcpos:=source;
  487. destpos:=PAnsiChar(dest);
  488. outleft:=outlength*2;
  489. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  490. begin
  491. err:=fpgetCerrno;
  492. case err of
  493. ESysEINVAL,
  494. ESysEILSEQ:
  495. begin
  496. { skip and set to '?' }
  497. inc(srcpos);
  498. dec(len);
  499. pwidechar(destpos)^:='?';
  500. inc(destpos,2);
  501. dec(outleft,2);
  502. { reset }
  503. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  504. if err=ESysEINVAL then
  505. break;
  506. end;
  507. ESysE2BIG:
  508. begin
  509. outoffset:=destpos-PAnsiChar(dest);
  510. { extend }
  511. setlength(dest,outlength+len);
  512. inc(outleft,len*2);
  513. inc(outlength,len);
  514. { string could have been moved }
  515. destpos:=PAnsiChar(dest)+outoffset;
  516. end;
  517. else
  518. runerror(231);
  519. end;
  520. end;
  521. // truncate string
  522. setlength(dest,length(dest)-outleft div 2);
  523. if free_iconv then
  524. iconv_close(use_iconv);
  525. end;
  526. function LowerWideString(const s : WideString) : WideString;
  527. var
  528. i : SizeInt;
  529. begin
  530. SetLength(result,length(s));
  531. for i:=0 to length(s)-1 do
  532. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  533. end;
  534. function UpperWideString(const s : WideString) : WideString;
  535. var
  536. i : SizeInt;
  537. begin
  538. SetLength(result,length(s));
  539. for i:=0 to length(s)-1 do
  540. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  541. end;
  542. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  543. begin
  544. if (len>length(s)) then
  545. if (length(s) < 10*256) then
  546. setlength(s,length(s)+10)
  547. else
  548. setlength(s,length(s)+length(s) shr 8);
  549. end;
  550. procedure ConcatCharToAnsiStr(const c: AnsiChar; var S: AnsiString; var index: SizeInt);
  551. begin
  552. EnsureAnsiLen(s,index);
  553. PAnsiChar(@s[index])^:=c;
  554. inc(index);
  555. end;
  556. { concatenates an utf-32 Char to a ansistring. S *must* be unique when entering. }
  557. {$if not(defined(beos) and not defined(haiku))}
  558. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  559. {$else not beos}
  560. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  561. {$endif beos}
  562. var
  563. p : PAnsiChar;
  564. mblen : size_t;
  565. begin
  566. { we know that s is unique -> avoid uniquestring calls}
  567. p:=@s[index];
  568. if (nc<=127) then
  569. ConcatCharToAnsiStr(AnsiChar(nc),s,index)
  570. else
  571. begin
  572. EnsureAnsiLen(s,index+MB_CUR_MAX);
  573. {$if not(defined(beos) and not defined(haiku))}
  574. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  575. {$else not beos}
  576. mblen:=wctomb(p,wchar_t(nc));
  577. {$endif not beos}
  578. if (mblen<>size_t(-1)) then
  579. inc(index,mblen)
  580. else
  581. begin
  582. { invalid wide AnsiChar }
  583. p^:='?';
  584. inc(index);
  585. end;
  586. end;
  587. end;
  588. function LowerAnsiString(const s : AnsiString) : AnsiString;
  589. var
  590. i, slen,
  591. resindex : SizeInt;
  592. mblen : size_t;
  593. {$if not(defined(beos) and not defined(haiku))}
  594. ombstate,
  595. nmbstate : mbstate_t;
  596. {$endif beos}
  597. wc : wchar_t;
  598. begin
  599. {$if not(defined(beos) and not defined(haiku))}
  600. fillchar(ombstate,sizeof(ombstate),0);
  601. fillchar(nmbstate,sizeof(nmbstate),0);
  602. {$endif beos}
  603. slen:=length(s);
  604. SetLength(result,slen+10);
  605. i:=1;
  606. resindex:=1;
  607. while (i<=slen) do
  608. begin
  609. if (s[i]<=#127) then
  610. begin
  611. wc:=wchar_t(s[i]);
  612. mblen:= 1;
  613. end
  614. else
  615. {$if not(defined(beos) and not defined(haiku))}
  616. mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
  617. {$else not beos}
  618. mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
  619. {$endif not beos}
  620. case mblen of
  621. size_t(-2):
  622. begin
  623. { partial invalid character, copy literally }
  624. while (i<=slen) do
  625. begin
  626. ConcatCharToAnsiStr(s[i],result,resindex);
  627. inc(i);
  628. end;
  629. end;
  630. size_t(-1), 0:
  631. begin
  632. { invalid or null character }
  633. ConcatCharToAnsiStr(s[i],result,resindex);
  634. inc(i);
  635. end;
  636. else
  637. begin
  638. { a valid sequence }
  639. { even if mblen = 1, the lowercase version may have a }
  640. { different length }
  641. { We can't do anything special if wchar_t is 16 bit... }
  642. {$if not(defined(beos) and not defined(haiku))}
  643. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  644. {$else not beos}
  645. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  646. {$endif not beos}
  647. inc(i,mblen);
  648. end;
  649. end;
  650. end;
  651. SetLength(result,resindex-1);
  652. end;
  653. function UpperAnsiString(const s : AnsiString) : AnsiString;
  654. var
  655. i, slen,
  656. resindex : SizeInt;
  657. mblen : size_t;
  658. {$if not(defined(beos) and not defined(haiku))}
  659. ombstate,
  660. nmbstate : mbstate_t;
  661. {$endif beos}
  662. wc : wchar_t;
  663. begin
  664. {$if not(defined(beos) and not defined(haiku))}
  665. fillchar(ombstate,sizeof(ombstate),0);
  666. fillchar(nmbstate,sizeof(nmbstate),0);
  667. {$endif beos}
  668. slen:=length(s);
  669. SetLength(result,slen+10);
  670. i:=1;
  671. resindex:=1;
  672. while (i<=slen) do
  673. begin
  674. if (s[i]<=#127) then
  675. begin
  676. wc:=wchar_t(s[i]);
  677. mblen:= 1;
  678. end
  679. else
  680. {$if not(defined(beos) and not defined(haiku))}
  681. mblen:=mbrtowc(@wc, PAnsiChar(@s[i]), slen-i+1, @ombstate);
  682. {$else not beos}
  683. mblen:=mbtowc(@wc, PAnsiChar(@s[i]), slen-i+1);
  684. {$endif beos}
  685. case mblen of
  686. size_t(-2):
  687. begin
  688. { partial invalid character, copy literally }
  689. while (i<=slen) do
  690. begin
  691. ConcatCharToAnsiStr(s[i],result,resindex);
  692. inc(i);
  693. end;
  694. end;
  695. size_t(-1), 0:
  696. begin
  697. { invalid or null character }
  698. ConcatCharToAnsiStr(s[i],result,resindex);
  699. inc(i);
  700. end;
  701. else
  702. begin
  703. { a valid sequence }
  704. { even if mblen = 1, the uppercase version may have a }
  705. { different length }
  706. { We can't do anything special if wchar_t is 16 bit... }
  707. {$if not(defined(beos) and not defined(haiku))}
  708. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  709. {$else not beos}
  710. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  711. {$endif not beos}
  712. inc(i,mblen);
  713. end;
  714. end;
  715. end;
  716. SetLength(result,resindex-1);
  717. end;
  718. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  719. var
  720. i, slen,
  721. destindex : SizeInt;
  722. uch : UCS4Char;
  723. begin
  724. slen:=length(s);
  725. setlength(result,slen+1);
  726. i:=1;
  727. destindex:=0;
  728. while (i<=slen) do
  729. begin
  730. uch:=UCS4Char(s[i]);
  731. if (uch=0) then
  732. result[destindex]:=32
  733. else if (uch<=$d7ff) or (uch>=$e000) then
  734. result[destindex]:=uch
  735. else if (uch<=$dbff) and
  736. (i<slen) and
  737. (s[i+1]>=#$dc00) and
  738. (s[i+1]<=#$dfff) then
  739. begin
  740. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  741. inc(i);
  742. end
  743. else { invalid surrogate pair }
  744. result[destindex]:=uch;
  745. inc(i);
  746. inc(destindex);
  747. end;
  748. result[destindex]:=UCS4Char(0);
  749. { Trimming length in this particular case is just a waste of time,
  750. because result will be interpreted as null-terminated and discarded
  751. almost immediately }
  752. end;
  753. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  754. {$if not(defined (aix) and defined(cpupowerpc32))}
  755. var
  756. hs1,hs2 : UCS4String;
  757. us1,us2 : WideString;
  758. begin
  759. { wcscoll interprets null chars as end-of-string -> filter out }
  760. if coIgnoreCase in Options then
  761. begin
  762. us1:=UpperWideString(s1);
  763. us2:=UpperWideString(s2);
  764. end
  765. else
  766. begin
  767. us1:=s1;
  768. us2:=s2;
  769. end;
  770. hs1:=WideStringToUCS4StringNoNulls(us1);
  771. hs2:=WideStringToUCS4StringNoNulls(us2);
  772. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  773. end;
  774. {$else}
  775. { AIX/PPC32 has a 16 bit wchar_t }
  776. var
  777. i, len: longint;
  778. us1,us2 : WideString;
  779. hs1, hs2: array of widechar;
  780. begin
  781. if coIgnoreCase in Options then
  782. begin
  783. us1:=UpperWideString(s1);
  784. us2:=UpperWideString(s2);
  785. end
  786. else
  787. begin
  788. us1:=s1;
  789. us2:=s2;
  790. end;
  791. len:=length(us1);
  792. setlength(hs1,len+1);
  793. for i:=1 to len do
  794. if us1[i]<>#0 then
  795. hs1[i-1]:=us1[i]
  796. else
  797. hs1[i-1]:=#32;
  798. hs1[len]:=#0;
  799. len:=length(us2);
  800. setlength(hs2,len+1);
  801. for i:=1 to len do
  802. if us2[i]<>#0 then
  803. hs2[i-1]:=us2[i]
  804. else
  805. hs2[i-1]:=#32;
  806. hs2[len]:=#0;
  807. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  808. end;
  809. {$endif}
  810. { return value: number of code points in the string. Whenever an invalid
  811. code point is encountered, all characters part of this invalid code point
  812. are considered to form one "character" and the next character is
  813. considered to be the start of a new (possibly also invalid) code point }
  814. function CharLengthPChar(const Str: PAnsiChar): PtrInt;
  815. var
  816. nextlen: ptrint;
  817. s: PAnsiChar;
  818. {$if not(defined(beos) and not defined(haiku))}
  819. mbstate: mbstate_t;
  820. {$endif not beos}
  821. begin
  822. result:=0;
  823. s:=str;
  824. {$if not(defined(beos) and not defined(haiku))}
  825. fillchar(mbstate,sizeof(mbstate),0);
  826. {$endif not beos}
  827. repeat
  828. {$if defined(beos) and not defined(haiku)}
  829. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  830. {$else beos}
  831. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  832. {$endif beos}
  833. { skip invalid/incomplete sequences }
  834. if (nextlen<0) then
  835. nextlen:=1;
  836. inc(result,1);
  837. inc(s,nextlen);
  838. until (nextlen=0);
  839. end;
  840. function CodePointLength(const Str: PAnsiChar; maxlookahead: ptrint): PtrInt;
  841. {$if not(defined(beos) and not defined(haiku))}
  842. var
  843. mbstate: mbstate_t;
  844. {$endif not beos}
  845. begin
  846. {$if defined(beos) and not defined(haiku)}
  847. result:=ptrint(mblen(str,maxlookahead));
  848. {$else beos}
  849. fillchar(mbstate,sizeof(mbstate),0);
  850. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  851. { mbrlen can also return -2 for "incomplete but potially valid character
  852. and data has been processed" }
  853. if result<0 then
  854. result:=-1;
  855. {$endif beos}
  856. end;
  857. function StrCompAnsiIntern(s1,s2 : PAnsiChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  858. var
  859. a,b: PAnsiChar;
  860. i: PtrInt;
  861. begin
  862. if not(canmodifys1) then
  863. getmem(a,len1+1)
  864. else
  865. a:=s1;
  866. for i:=0 to len1-1 do
  867. if s1[i]<>#0 then
  868. a[i]:=s1[i]
  869. else
  870. a[i]:=#32;
  871. a[len1]:=#0;
  872. if not(canmodifys2) then
  873. getmem(b,len2+1)
  874. else
  875. b:=s2;
  876. for i:=0 to len2-1 do
  877. if s2[i]<>#0 then
  878. b[i]:=s2[i]
  879. else
  880. b[i]:=#32;
  881. b[len2]:=#0;
  882. result:=strcoll(a,b);
  883. if not(canmodifys1) then
  884. freemem(a);
  885. if not(canmodifys2) then
  886. freemem(b);
  887. end;
  888. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  889. begin
  890. result:=StrCompAnsiIntern(PAnsiChar(s1),PAnsiChar(s2),length(s1),length(s2),false,false);
  891. end;
  892. function StrCompAnsi(s1,s2 : PAnsiChar): PtrInt;
  893. begin
  894. result:=strcoll(s1,s2);
  895. end;
  896. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  897. var
  898. a, b: AnsiString;
  899. begin
  900. a:=UpperAnsistring(s1);
  901. b:=UpperAnsistring(s2);
  902. result:=StrCompAnsiIntern(PAnsiChar(a),PAnsiChar(b),length(a),length(b),true,true);
  903. end;
  904. function AnsiStrIComp(S1, S2: PAnsiChar): PtrInt;
  905. begin
  906. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  907. end;
  908. function AnsiStrLComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
  909. var
  910. a, b: PAnsiChar;
  911. begin
  912. if (maxlen=0) then
  913. exit(0);
  914. if (s1[maxlen]<>#0) then
  915. begin
  916. getmem(a,maxlen+1);
  917. move(s1^,a^,maxlen);
  918. a[maxlen]:=#0;
  919. end
  920. else
  921. a:=s1;
  922. if (s2[maxlen]<>#0) then
  923. begin
  924. getmem(b,maxlen+1);
  925. move(s2^,b^,maxlen);
  926. b[maxlen]:=#0;
  927. end
  928. else
  929. b:=s2;
  930. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  931. if (a<>s1) then
  932. freemem(a);
  933. if (b<>s2) then
  934. freemem(b);
  935. end;
  936. function AnsiStrLIComp(S1, S2: PAnsiChar; MaxLen: PtrUInt): PtrInt;
  937. var
  938. a, b: ansistring;
  939. begin
  940. if (maxlen=0) then
  941. exit(0);
  942. setlength(a,maxlen);
  943. move(s1^,a[1],maxlen);
  944. setlength(b,maxlen);
  945. move(s2^,b[1],maxlen);
  946. result:=AnsiCompareText(a,b);
  947. end;
  948. procedure ansi2pchar(const s: ansistring; const orgp: PAnsiChar; out p: PAnsiChar);
  949. var
  950. newlen: sizeint;
  951. begin
  952. newlen:=length(s);
  953. if newlen>strlen(orgp) then
  954. fpc_rangeerror;
  955. p:=orgp;
  956. if (newlen>0) then
  957. move(s[1],p[0],newlen);
  958. p[newlen]:=#0;
  959. end;
  960. function AnsiStrLower(Str: PAnsiChar): PAnsiChar;
  961. var
  962. temp: ansistring;
  963. begin
  964. temp:=loweransistring(str);
  965. ansi2pchar(temp,str,result);
  966. end;
  967. function AnsiStrUpper(Str: PAnsiChar): PAnsiChar;
  968. var
  969. temp: ansistring;
  970. begin
  971. temp:=upperansistring(str);
  972. ansi2pchar(temp,str,result);
  973. end;
  974. function envvarset(const varname: PAnsiChar): boolean;
  975. var
  976. varval: PAnsiChar;
  977. begin
  978. varval:=fpgetenv(varname);
  979. result:=
  980. assigned(varval) and
  981. (varval[0]<>#0);
  982. end;
  983. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  984. var
  985. langinfo: PAnsiChar;
  986. begin
  987. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  988. if stdcp=scpFileSystemSingleByte then
  989. begin
  990. result:=CP_UTF8;
  991. exit;
  992. end;
  993. {$endif}
  994. { if none of the relevant LC_* environment variables are set, fall back to
  995. UTF-8 (this happens under some versions of OS X for GUI applications, which
  996. otherwise get CP_ASCII) }
  997. if envvarset('LC_ALL') or
  998. envvarset('LC_CTYPE') or
  999. envvarset('LANG') then
  1000. begin
  1001. langinfo:=nl_langinfo(CODESET);
  1002. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  1003. that causes it to return an empty string of UTF-8 locales
  1004. -> patch up (and in general, UTF-8 is a good default on
  1005. Unix platforms) }
  1006. if not assigned(langinfo) or
  1007. (langinfo^=#0) then
  1008. langinfo:='UTF-8';
  1009. Result:=GetCodepageByName(ansistring(langinfo));
  1010. end
  1011. else
  1012. Result:=unixcp.GetSystemCodepage;
  1013. end;
  1014. {$ifdef FPC_HAS_CPSTRING}
  1015. procedure SetStdIOCodePage(var T: Text); inline;
  1016. begin
  1017. case TextRec(T).Mode of
  1018. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  1019. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  1020. end;
  1021. end;
  1022. procedure SetStdIOCodePages; inline;
  1023. begin
  1024. SetStdIOCodePage(Input);
  1025. SetStdIOCodePage(Output);
  1026. SetStdIOCodePage(ErrOutput);
  1027. SetStdIOCodePage(StdOut);
  1028. SetStdIOCodePage(StdErr);
  1029. end;
  1030. {$endif FPC_HAS_CPSTRING}
  1031. var
  1032. OrgWideStringManager: TUnicodeStringManager;
  1033. Procedure SetCWideStringManager;
  1034. Var
  1035. CWideStringManager : TUnicodeStringManager;
  1036. begin
  1037. GetUnicodeStringManager(OrgWideStringManager);
  1038. CWideStringManager:=OrgWideStringManager;
  1039. With CWideStringManager do
  1040. begin
  1041. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1042. Ansi2WideMoveProc:=@Ansi2WideMove;
  1043. UpperWideStringProc:=@UpperWideString;
  1044. LowerWideStringProc:=@LowerWideString;
  1045. CompareWideStringProc:=@CompareWideString;
  1046. // CompareTextWideStringProc:=@CompareTextWideString;
  1047. CharLengthPCharProc:=@CharLengthPChar;
  1048. CodePointLengthProc:=@CodePointLength;
  1049. UpperAnsiStringProc:=@UpperAnsiString;
  1050. LowerAnsiStringProc:=@LowerAnsiString;
  1051. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1052. CompareTextAnsiStringProc:=@AnsiCompareText;
  1053. StrCompAnsiStringProc:=@StrCompAnsi;
  1054. StrICompAnsiStringProc:=@AnsiStrIComp;
  1055. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1056. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1057. StrLowerAnsiStringProc:=@AnsiStrLower;
  1058. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1059. ThreadInitProc:=@InitThread;
  1060. ThreadFiniProc:=@FiniThread;
  1061. { Unicode }
  1062. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1063. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1064. UpperUnicodeStringProc:=@UpperWideString;
  1065. LowerUnicodeStringProc:=@LowerWideString;
  1066. CompareUnicodeStringProc:=@CompareWideString;
  1067. { CodePage }
  1068. GetStandardCodePageProc:=@GetStandardCodePage;
  1069. end;
  1070. SetUnicodeStringManager(CWideStringManager);
  1071. end;
  1072. var
  1073. iconvlib:TLibHandle;
  1074. initialization
  1075. SetCWideStringManager;
  1076. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1077. { with the information from the environment variables according to POSIX }
  1078. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1079. setlocale(LC_ALL,'');
  1080. { load iconv library and iconvctl function }
  1081. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1082. if iconvlib=0 then
  1083. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
  1084. if iconvlib<>0 then
  1085. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1086. { set the DefaultSystemCodePage }
  1087. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1088. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1089. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1090. {$ifdef FPC_HAS_CPSTRING}
  1091. SetStdIOCodePages;
  1092. {$endif FPC_HAS_CPSTRING}
  1093. { init conversion tables for main program }
  1094. InitThread;
  1095. finalization
  1096. { fini conversion tables for main program }
  1097. FiniThread;
  1098. { unload iconv library }
  1099. if iconvlib<>0 then
  1100. FreeLibrary(iconvlib);
  1101. { restore previous (probably default) widestring manager so that subsequent calls
  1102. into the widestring manager won't trigger the finalized functionality }
  1103. SetWideStringManager(OrgWideStringManager);
  1104. end.