cwstring.pp 33 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. { Modern FreeBSD releases also claim to have iconv
  30. function inside libc, at least
  31. this is stated explicitly in iconv(3) man
  32. of FreeBSD from 10.0 release
  33. PM 2023-05-05 }
  34. {$ifdef freebsd}
  35. {$ifndef DISABLE_ICONV_LIBC}
  36. {$define iconv_is_in_libc}
  37. {$endif}
  38. {$endif}
  39. {$ifndef iconv_is_in_libc}
  40. {$if defined(haiku)}
  41. {$linklib textencoding}
  42. {$else}
  43. {$linklib iconv}
  44. {$endif}
  45. {$define useiconv}
  46. {$endif not iconv_is_in_libc}
  47. {$i rtldefs.inc}
  48. Uses
  49. BaseUnix,
  50. ctypes,
  51. unix,
  52. unixtype,
  53. initc,
  54. dynlibs,
  55. unixcp;
  56. Const
  57. {$ifndef useiconv}
  58. libiconvname='c'; // is in libc under Linux.
  59. libprefix='lib';
  60. {$else}
  61. {$ifdef haiku}
  62. libiconvname='textencoding'; // is in libtextencoding under Haiku
  63. libprefix='lib';
  64. {$else}
  65. {$if defined(darwin) or defined(aix)}
  66. libiconvname='libiconv';
  67. libprefix='';
  68. {$else}
  69. libiconvname='iconv';
  70. libprefix='lib';
  71. {$endif}
  72. {$endif}
  73. {$endif}
  74. { helper functions from libc }
  75. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  76. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  77. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  78. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  79. {$ifdef netbsd}
  80. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  81. that should be used }
  82. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale50';
  83. {$else}
  84. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  85. {$endif}
  86. {$if not(defined(beos) and not defined(haiku))}
  87. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  88. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  89. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  90. {$else beos}
  91. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  92. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  93. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  94. {$endif beos}
  95. const
  96. {$if defined(linux)}
  97. __LC_CTYPE = 0;
  98. LC_ALL = 6;
  99. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  100. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  101. CODESET = _NL_CTYPE_CODESET_NAME;
  102. {$elseif defined(darwin)}
  103. CODESET = 0;
  104. LC_ALL = 0;
  105. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  106. __LC_CTYPE = 0;
  107. LC_ALL = 0;
  108. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  109. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  110. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  111. {$elseif defined(solaris)}
  112. {$define ACCEPT_646}
  113. CODESET=49;
  114. LC_ALL = 6;
  115. {$elseif defined(beos)}
  116. {$ifdef haiku}
  117. CODESET= 0; // Checked for Haiku
  118. LC_ALL = 0; // Checked for Haiku
  119. {$else}
  120. {$warning check correct value for BeOS}
  121. CODESET=49;
  122. LC_ALL = 6; // Checked for BeOS
  123. {$endif}
  124. ESysEILSEQ = EILSEQ;
  125. {$elseif defined(OpenBSD)}
  126. {$define ACCEPT_646}
  127. CODESET = 51;
  128. LC_ALL = 0;
  129. {$elseif defined(NetBSD)}
  130. {$define ACCEPT_646}
  131. CODESET = 51;
  132. LC_ALL = 0;
  133. {$elseif defined(aix)}
  134. CODESET = 49;
  135. LC_ALL = -1;
  136. {$elseif defined(dragonfly)}
  137. CODESET = 0;
  138. LC_ALL = 0;
  139. __LC_CTYPE = 0;
  140. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  141. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  142. {$else not aix}
  143. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  144. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  145. {$endif}
  146. { unicode encoding name }
  147. {$ifdef FPC_LITTLE_ENDIAN}
  148. unicode_encoding2 = 'UTF-16LE';
  149. unicode_encoding4 = 'UCS-4LE';
  150. {$else FPC_LITTLE_ENDIAN}
  151. {$ifdef AIX}
  152. unicode_encoding2 = 'UTF-16';
  153. {$else AIX}
  154. unicode_encoding2 = 'UTF-16BE';
  155. unicode_encoding4 = 'UCS-4BE';
  156. {$endif AIX}
  157. {$endif FPC_LITTLE_ENDIAN}
  158. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  159. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  160. { that's a libc macro mapped to internal functions/variables }
  161. { and thus not a stable external API on systems where libc }
  162. { breaks backwards compatibility every now and then }
  163. MB_CUR_MAX = 10;
  164. { Requests for iconvctl }
  165. ICONV_TRIVIALP = 0; // int *argument
  166. ICONV_GET_TRANSLITERATE = 1; // int *argument
  167. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  168. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  169. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  170. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  171. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  172. type
  173. piconv_t = ^iconv_t;
  174. iconv_t = pointer;
  175. nl_item = cint;
  176. {$ifdef haiku}
  177. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  178. {$else}
  179. {$ifndef beos}
  180. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  181. {$endif}
  182. {$endif}
  183. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  184. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  185. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  186. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  187. const
  188. iconvctlname='iconvctl';
  189. {$else}
  190. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  191. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  192. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  193. const
  194. iconvctlname='libiconvctl';
  195. {$endif}
  196. var
  197. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  198. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  199. threadvar
  200. iconv_ansi2wide,
  201. iconv_wide2ansi : iconv_t;
  202. { since we cache the iconv_t converters, we have to do the same
  203. for the DefaultSystemCodePage variable since if it changes, we
  204. have to re-initialize the converters too. We can't do that via
  205. a callback in the widestring manager because DefaultSystemCodePage
  206. is not a threadvar and we can't automatically change this in all
  207. threads }
  208. current_DefaultSystemCodePage: TSystemCodePage;
  209. procedure InitThread;
  210. var
  211. transliterate: cint;
  212. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  213. iconvindex: longint;
  214. {$endif}
  215. iconvname, toencoding: rawbytestring;
  216. begin
  217. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  218. {$if declared(iconvindex)}
  219. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  220. if iconvindex<>-1 then
  221. iconvname:=UnixCpMap[iconvindex].name
  222. else
  223. { default to UTF-8 on Unix platforms }
  224. iconvname:='UTF-8';
  225. {$else}
  226. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  227. iconvname:='UTF-8';
  228. {$endif}
  229. toencoding:=iconvname;
  230. if not assigned(iconvctl) then
  231. toencoding:=toencoding+'//TRANSLIT';
  232. iconv_wide2ansi:=iconv_open(pchar(toencoding),unicode_encoding2);
  233. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  234. if assigned(iconvctl) and
  235. (iconv_wide2ansi<>iconv_t(-1)) then
  236. begin
  237. transliterate:=1;
  238. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  239. end;
  240. end;
  241. procedure FiniThread;
  242. begin
  243. if (iconv_wide2ansi <> iconv_t(-1)) then
  244. iconv_close(iconv_wide2ansi);
  245. if (iconv_ansi2wide <> iconv_t(-1)) then
  246. iconv_close(iconv_ansi2wide);
  247. end;
  248. {$if defined(beos) and not defined(haiku)}
  249. function nl_langinfo(__item:nl_item):pchar;
  250. begin
  251. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  252. // Now implement the minimum required to correctly initialize WideString support
  253. case __item of
  254. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  255. else
  256. begin
  257. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  258. Result := '';
  259. end;
  260. end;
  261. end;
  262. {$endif}
  263. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  264. var
  265. iconvindex: longint;
  266. toencoding: rawbytestring;
  267. transliterate: cint;
  268. begin
  269. { TODO: add caching (then we also don't need separate code for
  270. the default system page and other ones)
  271. -- typecasting an ansistring function result to pchar is
  272. unsafe normally, but these are constant strings -> no
  273. problem }
  274. open_iconv_for_cps:=iconv_t(-1);
  275. iconvindex:=GetCodepageData(cp);
  276. if iconvindex=-1 then
  277. exit;
  278. repeat
  279. if cp_is_from then
  280. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  281. else
  282. begin
  283. toencoding:=UnixCpMap[iconvindex].name;
  284. if not assigned(iconvctl) then
  285. toencoding:=toencoding+'//TRANSLIT';
  286. open_iconv_for_cps:=iconv_open(pchar(toencoding),otherencoding);
  287. end;
  288. inc(iconvindex);
  289. until (open_iconv_for_cps<>iconv_t(-1)) or
  290. (iconvindex>high(UnixCpMap)) or
  291. (UnixCpMap[iconvindex].cp<>cp);
  292. if not cp_is_from and
  293. (open_iconv_for_cps<>iconv_t(-1)) and
  294. assigned(iconvctl) then
  295. begin
  296. transliterate:=1;
  297. iconvctl(open_iconv_for_cps,ICONV_SET_TRANSLITERATE,@transliterate);
  298. end;
  299. end;
  300. {$ifdef aix}
  301. {$i cwstraix.inc}
  302. {$endif aix}
  303. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  304. var
  305. outlength,
  306. outoffset,
  307. srclen,
  308. outleft : size_t;
  309. use_iconv: iconv_t;
  310. srcpos : pwidechar;
  311. destpos: pchar;
  312. mynil : pchar;
  313. my0 : size_t;
  314. err : longint;
  315. transliterate: cint;
  316. free_iconv: boolean;
  317. {$ifdef aix}
  318. intermediate: rawbytestring;
  319. {$endif aix}
  320. begin
  321. {$ifdef aix}
  322. { AIX libiconv does not support converting cp866 to anything else except
  323. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  324. if cp=866 then
  325. begin
  326. Wide2AnsiMove(source,intermediate,28595,len);
  327. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  328. exit;
  329. end;
  330. {$endif aix}
  331. if (cp=DefaultSystemCodePage) then
  332. begin
  333. { update iconv converter in case the DefaultSystemCodePage has been
  334. changed }
  335. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  336. begin
  337. FiniThread;
  338. InitThread;
  339. end;
  340. use_iconv:=iconv_wide2ansi;
  341. free_iconv:=false;
  342. end
  343. else
  344. begin
  345. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  346. if (use_iconv<>iconv_t(-1)) and
  347. assigned(iconvctl) then
  348. begin
  349. transliterate:=1;
  350. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  351. end;
  352. free_iconv:=true;
  353. end;
  354. { unsupported encoding -> default move }
  355. if use_iconv=iconv_t(-1) then
  356. begin
  357. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  358. exit;
  359. end;
  360. mynil:=nil;
  361. my0:=0;
  362. { rought estimation }
  363. setlength(dest,len*3);
  364. outlength:=len*3;
  365. srclen:=len*2;
  366. srcpos:=source;
  367. destpos:=pchar(dest);
  368. outleft:=outlength;
  369. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  370. begin
  371. err:=fpgetCerrno;
  372. case err of
  373. { last character is incomplete sequence }
  374. ESysEINVAL,
  375. { incomplete sequence in the middle }
  376. ESysEILSEQ:
  377. begin
  378. { skip and set to '?' }
  379. inc(srcpos);
  380. dec(srclen,2);
  381. destpos^:='?';
  382. inc(destpos);
  383. dec(outleft);
  384. { reset }
  385. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  386. if err=ESysEINVAL then
  387. break;
  388. end;
  389. ESysE2BIG:
  390. begin
  391. outoffset:=destpos-pchar(dest);
  392. { extend }
  393. setlength(dest,outlength+len*3);
  394. inc(outleft,len*3);
  395. inc(outlength,len*3);
  396. { string could have been moved }
  397. destpos:=pchar(dest)+outoffset;
  398. end;
  399. else
  400. runerror(231);
  401. end;
  402. end;
  403. // truncate string
  404. setlength(dest,length(dest)-outleft);
  405. SetCodePage(dest,cp,false);
  406. if free_iconv then
  407. iconv_close(use_iconv);
  408. end;
  409. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  410. var
  411. outlength,
  412. outoffset,
  413. outleft : size_t;
  414. use_iconv: iconv_t;
  415. srcpos,
  416. destpos: pchar;
  417. mynil : pchar;
  418. my0 : size_t;
  419. err: cint;
  420. iconvindex: longint;
  421. free_iconv: boolean;
  422. {$ifdef aix}
  423. intermediate: rawbytestring;
  424. {$endif aix}
  425. begin
  426. {$ifdef aix}
  427. { AIX libiconv does not support converting cp866 to anything else except
  428. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  429. if cp=866 then
  430. begin
  431. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  432. source:=pchar(intermediate);
  433. end;
  434. {$endif aix}
  435. if (cp=DefaultSystemCodePage) then
  436. begin
  437. { update iconv converter in case the DefaultSystemCodePage has been
  438. changed }
  439. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  440. begin
  441. FiniThread;
  442. InitThread;
  443. end;
  444. use_iconv:=iconv_ansi2wide;
  445. free_iconv:=false;
  446. end
  447. else
  448. begin
  449. { TODO: add caching (then we also don't need separate code for
  450. the default system page and other ones)
  451. -- typecasting an ansistring function result to pchar is
  452. unsafe normally, but these are constant strings -> no
  453. problem }
  454. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  455. free_iconv:=true;
  456. end;
  457. { unsupported encoding -> default move }
  458. if use_iconv=iconv_t(-1) then
  459. begin
  460. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  461. exit;
  462. end;
  463. mynil:=nil;
  464. my0:=0;
  465. // extra space
  466. outlength:=len+1;
  467. setlength(dest,outlength);
  468. srcpos:=source;
  469. destpos:=pchar(dest);
  470. outleft:=outlength*2;
  471. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  472. begin
  473. err:=fpgetCerrno;
  474. case err of
  475. ESysEINVAL,
  476. ESysEILSEQ:
  477. begin
  478. { skip and set to '?' }
  479. inc(srcpos);
  480. dec(len);
  481. pwidechar(destpos)^:='?';
  482. inc(destpos,2);
  483. dec(outleft,2);
  484. { reset }
  485. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  486. if err=ESysEINVAL then
  487. break;
  488. end;
  489. ESysE2BIG:
  490. begin
  491. outoffset:=destpos-pchar(dest);
  492. { extend }
  493. setlength(dest,outlength+len);
  494. inc(outleft,len*2);
  495. inc(outlength,len);
  496. { string could have been moved }
  497. destpos:=pchar(dest)+outoffset;
  498. end;
  499. else
  500. runerror(231);
  501. end;
  502. end;
  503. // truncate string
  504. setlength(dest,length(dest)-outleft div 2);
  505. if free_iconv then
  506. iconv_close(use_iconv);
  507. end;
  508. function LowerWideString(const s : WideString) : WideString;
  509. var
  510. i : SizeInt;
  511. begin
  512. SetLength(result,length(s));
  513. for i:=0 to length(s)-1 do
  514. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  515. end;
  516. function UpperWideString(const s : WideString) : WideString;
  517. var
  518. i : SizeInt;
  519. begin
  520. SetLength(result,length(s));
  521. for i:=0 to length(s)-1 do
  522. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  523. end;
  524. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  525. begin
  526. if (len>length(s)) then
  527. if (length(s) < 10*256) then
  528. setlength(s,length(s)+10)
  529. else
  530. setlength(s,length(s)+length(s) shr 8);
  531. end;
  532. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  533. begin
  534. EnsureAnsiLen(s,index);
  535. pchar(@s[index])^:=c;
  536. inc(index);
  537. end;
  538. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  539. {$if not(defined(beos) and not defined(haiku))}
  540. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  541. {$else not beos}
  542. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  543. {$endif beos}
  544. var
  545. p : pchar;
  546. mblen : size_t;
  547. begin
  548. { we know that s is unique -> avoid uniquestring calls}
  549. p:=@s[index];
  550. if (nc<=127) then
  551. ConcatCharToAnsiStr(char(nc),s,index)
  552. else
  553. begin
  554. EnsureAnsiLen(s,index+MB_CUR_MAX);
  555. {$if not(defined(beos) and not defined(haiku))}
  556. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  557. {$else not beos}
  558. mblen:=wctomb(p,wchar_t(nc));
  559. {$endif not beos}
  560. if (mblen<>size_t(-1)) then
  561. inc(index,mblen)
  562. else
  563. begin
  564. { invalid wide char }
  565. p^:='?';
  566. inc(index);
  567. end;
  568. end;
  569. end;
  570. function LowerAnsiString(const s : AnsiString) : AnsiString;
  571. var
  572. i, slen,
  573. resindex : SizeInt;
  574. mblen : size_t;
  575. {$if not(defined(beos) and not defined(haiku))}
  576. ombstate,
  577. nmbstate : mbstate_t;
  578. {$endif beos}
  579. wc : wchar_t;
  580. begin
  581. {$if not(defined(beos) and not defined(haiku))}
  582. fillchar(ombstate,sizeof(ombstate),0);
  583. fillchar(nmbstate,sizeof(nmbstate),0);
  584. {$endif beos}
  585. slen:=length(s);
  586. SetLength(result,slen+10);
  587. i:=1;
  588. resindex:=1;
  589. while (i<=slen) do
  590. begin
  591. if (s[i]<=#127) then
  592. begin
  593. wc:=wchar_t(s[i]);
  594. mblen:= 1;
  595. end
  596. else
  597. {$if not(defined(beos) and not defined(haiku))}
  598. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  599. {$else not beos}
  600. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  601. {$endif not beos}
  602. case mblen of
  603. size_t(-2):
  604. begin
  605. { partial invalid character, copy literally }
  606. while (i<=slen) do
  607. begin
  608. ConcatCharToAnsiStr(s[i],result,resindex);
  609. inc(i);
  610. end;
  611. end;
  612. size_t(-1), 0:
  613. begin
  614. { invalid or null character }
  615. ConcatCharToAnsiStr(s[i],result,resindex);
  616. inc(i);
  617. end;
  618. else
  619. begin
  620. { a valid sequence }
  621. { even if mblen = 1, the lowercase version may have a }
  622. { different length }
  623. { We can't do anything special if wchar_t is 16 bit... }
  624. {$if not(defined(beos) and not defined(haiku))}
  625. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  626. {$else not beos}
  627. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  628. {$endif not beos}
  629. inc(i,mblen);
  630. end;
  631. end;
  632. end;
  633. SetLength(result,resindex-1);
  634. end;
  635. function UpperAnsiString(const s : AnsiString) : AnsiString;
  636. var
  637. i, slen,
  638. resindex : SizeInt;
  639. mblen : size_t;
  640. {$if not(defined(beos) and not defined(haiku))}
  641. ombstate,
  642. nmbstate : mbstate_t;
  643. {$endif beos}
  644. wc : wchar_t;
  645. begin
  646. {$if not(defined(beos) and not defined(haiku))}
  647. fillchar(ombstate,sizeof(ombstate),0);
  648. fillchar(nmbstate,sizeof(nmbstate),0);
  649. {$endif beos}
  650. slen:=length(s);
  651. SetLength(result,slen+10);
  652. i:=1;
  653. resindex:=1;
  654. while (i<=slen) do
  655. begin
  656. if (s[i]<=#127) then
  657. begin
  658. wc:=wchar_t(s[i]);
  659. mblen:= 1;
  660. end
  661. else
  662. {$if not(defined(beos) and not defined(haiku))}
  663. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  664. {$else not beos}
  665. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  666. {$endif beos}
  667. case mblen of
  668. size_t(-2):
  669. begin
  670. { partial invalid character, copy literally }
  671. while (i<=slen) do
  672. begin
  673. ConcatCharToAnsiStr(s[i],result,resindex);
  674. inc(i);
  675. end;
  676. end;
  677. size_t(-1), 0:
  678. begin
  679. { invalid or null character }
  680. ConcatCharToAnsiStr(s[i],result,resindex);
  681. inc(i);
  682. end;
  683. else
  684. begin
  685. { a valid sequence }
  686. { even if mblen = 1, the uppercase version may have a }
  687. { different length }
  688. { We can't do anything special if wchar_t is 16 bit... }
  689. {$if not(defined(beos) and not defined(haiku))}
  690. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  691. {$else not beos}
  692. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  693. {$endif not beos}
  694. inc(i,mblen);
  695. end;
  696. end;
  697. end;
  698. SetLength(result,resindex-1);
  699. end;
  700. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  701. var
  702. i, slen,
  703. destindex : SizeInt;
  704. uch : UCS4Char;
  705. begin
  706. slen:=length(s);
  707. setlength(result,slen+1);
  708. i:=1;
  709. destindex:=0;
  710. while (i<=slen) do
  711. begin
  712. uch:=UCS4Char(s[i]);
  713. if (uch=0) then
  714. result[destindex]:=32
  715. else if (uch<=$d7ff) or (uch>=$e000) then
  716. result[destindex]:=uch
  717. else if (uch<=$dbff) and
  718. (i<slen) and
  719. (s[i+1]>=#$dc00) and
  720. (s[i+1]<=#$dfff) then
  721. begin
  722. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  723. inc(i);
  724. end
  725. else { invalid surrogate pair }
  726. result[destindex]:=uch;
  727. inc(i);
  728. inc(destindex);
  729. end;
  730. result[destindex]:=UCS4Char(0);
  731. { Trimming length in this particular case is just a waste of time,
  732. because result will be interpreted as null-terminated and discarded
  733. almost immediately }
  734. end;
  735. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  736. {$if not(defined (aix) and defined(cpupowerpc32))}
  737. var
  738. hs1,hs2 : UCS4String;
  739. us1,us2 : WideString;
  740. begin
  741. { wcscoll interprets null chars as end-of-string -> filter out }
  742. if coIgnoreCase in Options then
  743. begin
  744. us1:=UpperWideString(s1);
  745. us2:=UpperWideString(s2);
  746. end
  747. else
  748. begin
  749. us1:=s1;
  750. us2:=s2;
  751. end;
  752. hs1:=WideStringToUCS4StringNoNulls(us1);
  753. hs2:=WideStringToUCS4StringNoNulls(us2);
  754. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  755. end;
  756. {$else}
  757. { AIX/PPC32 has a 16 bit wchar_t }
  758. var
  759. i, len: longint;
  760. us1,us2 : WideString;
  761. hs1, hs2: array of widechar;
  762. begin
  763. if coIgnoreCase in Options then
  764. begin
  765. us1:=UpperWideString(s1);
  766. us2:=UpperWideString(s2);
  767. end
  768. else
  769. begin
  770. us1:=s1;
  771. us2:=s2;
  772. end;
  773. len:=length(us1);
  774. setlength(hs1,len+1);
  775. for i:=1 to len do
  776. if us1[i]<>#0 then
  777. hs1[i-1]:=us1[i]
  778. else
  779. hs1[i-1]:=#32;
  780. hs1[len]:=#0;
  781. len:=length(us2);
  782. setlength(hs2,len+1);
  783. for i:=1 to len do
  784. if us2[i]<>#0 then
  785. hs2[i-1]:=us2[i]
  786. else
  787. hs2[i-1]:=#32;
  788. hs2[len]:=#0;
  789. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  790. end;
  791. {$endif}
  792. { return value: number of code points in the string. Whenever an invalid
  793. code point is encountered, all characters part of this invalid code point
  794. are considered to form one "character" and the next character is
  795. considered to be the start of a new (possibly also invalid) code point }
  796. function CharLengthPChar(const Str: PChar): PtrInt;
  797. var
  798. nextlen: ptrint;
  799. s: pchar;
  800. {$if not(defined(beos) and not defined(haiku))}
  801. mbstate: mbstate_t;
  802. {$endif not beos}
  803. begin
  804. result:=0;
  805. s:=str;
  806. {$if not(defined(beos) and not defined(haiku))}
  807. fillchar(mbstate,sizeof(mbstate),0);
  808. {$endif not beos}
  809. repeat
  810. {$if defined(beos) and not defined(haiku)}
  811. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  812. {$else beos}
  813. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  814. {$endif beos}
  815. { skip invalid/incomplete sequences }
  816. if (nextlen<0) then
  817. nextlen:=1;
  818. inc(result,1);
  819. inc(s,nextlen);
  820. until (nextlen=0);
  821. end;
  822. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  823. {$if not(defined(beos) and not defined(haiku))}
  824. var
  825. mbstate: mbstate_t;
  826. {$endif not beos}
  827. begin
  828. {$if defined(beos) and not defined(haiku)}
  829. result:=ptrint(mblen(str,maxlookahead));
  830. {$else beos}
  831. fillchar(mbstate,sizeof(mbstate),0);
  832. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  833. { mbrlen can also return -2 for "incomplete but potially valid character
  834. and data has been processed" }
  835. if result<0 then
  836. result:=-1;
  837. {$endif beos}
  838. end;
  839. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  840. var
  841. a,b: pchar;
  842. i: PtrInt;
  843. begin
  844. if not(canmodifys1) then
  845. getmem(a,len1+1)
  846. else
  847. a:=s1;
  848. for i:=0 to len1-1 do
  849. if s1[i]<>#0 then
  850. a[i]:=s1[i]
  851. else
  852. a[i]:=#32;
  853. a[len1]:=#0;
  854. if not(canmodifys2) then
  855. getmem(b,len2+1)
  856. else
  857. b:=s2;
  858. for i:=0 to len2-1 do
  859. if s2[i]<>#0 then
  860. b[i]:=s2[i]
  861. else
  862. b[i]:=#32;
  863. b[len2]:=#0;
  864. result:=strcoll(a,b);
  865. if not(canmodifys1) then
  866. freemem(a);
  867. if not(canmodifys2) then
  868. freemem(b);
  869. end;
  870. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  871. begin
  872. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  873. end;
  874. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  875. begin
  876. result:=strcoll(s1,s2);
  877. end;
  878. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  879. var
  880. a, b: AnsiString;
  881. begin
  882. a:=UpperAnsistring(s1);
  883. b:=UpperAnsistring(s2);
  884. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  885. end;
  886. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  887. begin
  888. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  889. end;
  890. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  891. var
  892. a, b: pchar;
  893. begin
  894. if (maxlen=0) then
  895. exit(0);
  896. if (s1[maxlen]<>#0) then
  897. begin
  898. getmem(a,maxlen+1);
  899. move(s1^,a^,maxlen);
  900. a[maxlen]:=#0;
  901. end
  902. else
  903. a:=s1;
  904. if (s2[maxlen]<>#0) then
  905. begin
  906. getmem(b,maxlen+1);
  907. move(s2^,b^,maxlen);
  908. b[maxlen]:=#0;
  909. end
  910. else
  911. b:=s2;
  912. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  913. if (a<>s1) then
  914. freemem(a);
  915. if (b<>s2) then
  916. freemem(b);
  917. end;
  918. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  919. var
  920. a, b: ansistring;
  921. begin
  922. if (maxlen=0) then
  923. exit(0);
  924. setlength(a,maxlen);
  925. move(s1^,a[1],maxlen);
  926. setlength(b,maxlen);
  927. move(s2^,b[1],maxlen);
  928. result:=AnsiCompareText(a,b);
  929. end;
  930. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  931. var
  932. newlen: sizeint;
  933. begin
  934. newlen:=length(s);
  935. if newlen>strlen(orgp) then
  936. fpc_rangeerror;
  937. p:=orgp;
  938. if (newlen>0) then
  939. move(s[1],p[0],newlen);
  940. p[newlen]:=#0;
  941. end;
  942. function AnsiStrLower(Str: PChar): PChar;
  943. var
  944. temp: ansistring;
  945. begin
  946. temp:=loweransistring(str);
  947. ansi2pchar(temp,str,result);
  948. end;
  949. function AnsiStrUpper(Str: PChar): PChar;
  950. var
  951. temp: ansistring;
  952. begin
  953. temp:=upperansistring(str);
  954. ansi2pchar(temp,str,result);
  955. end;
  956. function envvarset(const varname: pchar): boolean;
  957. var
  958. varval: pchar;
  959. begin
  960. varval:=fpgetenv(varname);
  961. result:=
  962. assigned(varval) and
  963. (varval[0]<>#0);
  964. end;
  965. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  966. var
  967. langinfo: pchar;
  968. begin
  969. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  970. if stdcp=scpFileSystemSingleByte then
  971. begin
  972. result:=CP_UTF8;
  973. exit;
  974. end;
  975. {$endif}
  976. { if none of the relevant LC_* environment variables are set, fall back to
  977. UTF-8 (this happens under some versions of OS X for GUI applications, which
  978. otherwise get CP_ASCII) }
  979. if envvarset('LC_ALL') or
  980. envvarset('LC_CTYPE') or
  981. envvarset('LANG') then
  982. begin
  983. langinfo:=nl_langinfo(CODESET);
  984. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  985. that causes it to return an empty string of UTF-8 locales
  986. -> patch up (and in general, UTF-8 is a good default on
  987. Unix platforms) }
  988. if not assigned(langinfo) or
  989. (langinfo^=#0) then
  990. langinfo:='UTF-8';
  991. Result:=GetCodepageByName(ansistring(langinfo));
  992. end
  993. else
  994. Result:=unixcp.GetSystemCodepage;
  995. end;
  996. {$ifdef FPC_HAS_CPSTRING}
  997. procedure SetStdIOCodePage(var T: Text); inline;
  998. begin
  999. case TextRec(T).Mode of
  1000. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  1001. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  1002. end;
  1003. end;
  1004. procedure SetStdIOCodePages; inline;
  1005. begin
  1006. SetStdIOCodePage(Input);
  1007. SetStdIOCodePage(Output);
  1008. SetStdIOCodePage(ErrOutput);
  1009. SetStdIOCodePage(StdOut);
  1010. SetStdIOCodePage(StdErr);
  1011. end;
  1012. {$endif FPC_HAS_CPSTRING}
  1013. var
  1014. OrgWideStringManager: TUnicodeStringManager;
  1015. Procedure SetCWideStringManager;
  1016. Var
  1017. CWideStringManager : TUnicodeStringManager;
  1018. begin
  1019. GetUnicodeStringManager(OrgWideStringManager);
  1020. CWideStringManager:=OrgWideStringManager;
  1021. With CWideStringManager do
  1022. begin
  1023. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1024. Ansi2WideMoveProc:=@Ansi2WideMove;
  1025. UpperWideStringProc:=@UpperWideString;
  1026. LowerWideStringProc:=@LowerWideString;
  1027. CompareWideStringProc:=@CompareWideString;
  1028. // CompareTextWideStringProc:=@CompareTextWideString;
  1029. CharLengthPCharProc:=@CharLengthPChar;
  1030. CodePointLengthProc:=@CodePointLength;
  1031. UpperAnsiStringProc:=@UpperAnsiString;
  1032. LowerAnsiStringProc:=@LowerAnsiString;
  1033. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1034. CompareTextAnsiStringProc:=@AnsiCompareText;
  1035. StrCompAnsiStringProc:=@StrCompAnsi;
  1036. StrICompAnsiStringProc:=@AnsiStrIComp;
  1037. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1038. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1039. StrLowerAnsiStringProc:=@AnsiStrLower;
  1040. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1041. ThreadInitProc:=@InitThread;
  1042. ThreadFiniProc:=@FiniThread;
  1043. { Unicode }
  1044. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1045. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1046. UpperUnicodeStringProc:=@UpperWideString;
  1047. LowerUnicodeStringProc:=@LowerWideString;
  1048. CompareUnicodeStringProc:=@CompareWideString;
  1049. { CodePage }
  1050. GetStandardCodePageProc:=@GetStandardCodePage;
  1051. end;
  1052. SetUnicodeStringManager(CWideStringManager);
  1053. end;
  1054. var
  1055. iconvlib:TLibHandle;
  1056. initialization
  1057. SetCWideStringManager;
  1058. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1059. { with the information from the environment variables according to POSIX }
  1060. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1061. setlocale(LC_ALL,'');
  1062. { load iconv library and iconvctl function }
  1063. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1064. if iconvlib=0 then
  1065. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
  1066. if iconvlib<>0 then
  1067. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1068. { set the DefaultSystemCodePage }
  1069. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1070. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1071. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1072. {$ifdef FPC_HAS_CPSTRING}
  1073. SetStdIOCodePages;
  1074. {$endif FPC_HAS_CPSTRING}
  1075. { init conversion tables for main program }
  1076. InitThread;
  1077. finalization
  1078. { fini conversion tables for main program }
  1079. FiniThread;
  1080. { unload iconv library }
  1081. if iconvlib<>0 then
  1082. FreeLibrary(iconvlib);
  1083. { restore previous (probably default) widestring manager so that subsequent calls
  1084. into the widestring manager won't trigger the finalized functionality }
  1085. SetWideStringManager(OrgWideStringManager);
  1086. end.