cwstring.pp 32 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. { Modern FreeBSD releases also claim to have iconv
  30. function inside libc, at least
  31. this is stated explicitly in iconv(3) man
  32. of FreeBSD from 10.0 release
  33. PM 2023-05-05 }
  34. {$ifdef freebsd}
  35. {$ifndef DISABLE_ICONV_LIBC}
  36. {$define iconv_is_in_libc}
  37. {$endif}
  38. {$endif}
  39. {$ifndef iconv_is_in_libc}
  40. {$if defined(haiku)}
  41. {$linklib textencoding}
  42. {$else}
  43. {$linklib iconv}
  44. {$endif}
  45. {$define useiconv}
  46. {$endif not iconv_is_in_libc}
  47. {$i rtldefs.inc}
  48. Uses
  49. BaseUnix,
  50. ctypes,
  51. unix,
  52. unixtype,
  53. initc,
  54. dynlibs,
  55. unixcp;
  56. Const
  57. {$ifndef useiconv}
  58. libiconvname='c'; // is in libc under Linux.
  59. libprefix='lib';
  60. {$else}
  61. {$ifdef haiku}
  62. libiconvname='textencoding'; // is in libtextencoding under Haiku
  63. libprefix='lib';
  64. {$else}
  65. {$if defined(darwin) or defined(aix)}
  66. libiconvname='libiconv';
  67. libprefix='';
  68. {$else}
  69. libiconvname='iconv';
  70. libprefix='lib';
  71. {$endif}
  72. {$endif}
  73. {$endif}
  74. { helper functions from libc }
  75. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  76. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  77. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  78. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  79. {$ifdef netbsd}
  80. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  81. that should be used }
  82. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale50';
  83. {$else}
  84. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  85. {$endif}
  86. {$if not(defined(beos) and not defined(haiku))}
  87. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  88. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  89. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  90. {$else beos}
  91. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  92. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  93. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  94. {$endif beos}
  95. const
  96. {$if defined(linux)}
  97. __LC_CTYPE = 0;
  98. LC_ALL = 6;
  99. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  100. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  101. CODESET = _NL_CTYPE_CODESET_NAME;
  102. {$elseif defined(darwin)}
  103. CODESET = 0;
  104. LC_ALL = 0;
  105. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  106. __LC_CTYPE = 0;
  107. LC_ALL = 0;
  108. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  109. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  110. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  111. {$elseif defined(solaris)}
  112. {$define ACCEPT_646}
  113. CODESET=49;
  114. LC_ALL = 6;
  115. {$elseif defined(beos)}
  116. {$ifdef haiku}
  117. CODESET= 0; // Checked for Haiku
  118. LC_ALL = 0; // Checked for Haiku
  119. {$else}
  120. {$warning check correct value for BeOS}
  121. CODESET=49;
  122. LC_ALL = 6; // Checked for BeOS
  123. {$endif}
  124. ESysEILSEQ = EILSEQ;
  125. {$elseif defined(OpenBSD)}
  126. {$define ACCEPT_646}
  127. CODESET = 51;
  128. LC_ALL = 0;
  129. {$elseif defined(NetBSD)}
  130. {$define ACCEPT_646}
  131. CODESET = 51;
  132. LC_ALL = 0;
  133. {$elseif defined(aix)}
  134. CODESET = 49;
  135. LC_ALL = -1;
  136. {$elseif defined(dragonfly)}
  137. CODESET = 0;
  138. LC_ALL = 0;
  139. __LC_CTYPE = 0;
  140. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  141. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  142. {$else not aix}
  143. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  144. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  145. {$endif}
  146. { unicode encoding name }
  147. {$ifdef FPC_LITTLE_ENDIAN}
  148. unicode_encoding2 = 'UTF-16LE';
  149. unicode_encoding4 = 'UCS-4LE';
  150. {$else FPC_LITTLE_ENDIAN}
  151. {$ifdef AIX}
  152. unicode_encoding2 = 'UTF-16';
  153. {$else AIX}
  154. unicode_encoding2 = 'UTF-16BE';
  155. unicode_encoding4 = 'UCS-4BE';
  156. {$endif AIX}
  157. {$endif FPC_LITTLE_ENDIAN}
  158. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  159. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  160. { that's a libc macro mapped to internal functions/variables }
  161. { and thus not a stable external API on systems where libc }
  162. { breaks backwards compatibility every now and then }
  163. MB_CUR_MAX = 10;
  164. { Requests for iconvctl }
  165. ICONV_TRIVIALP = 0; // int *argument
  166. ICONV_GET_TRANSLITERATE = 1; // int *argument
  167. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  168. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  169. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  170. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  171. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  172. type
  173. piconv_t = ^iconv_t;
  174. iconv_t = pointer;
  175. nl_item = cint;
  176. {$ifdef haiku}
  177. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  178. {$else}
  179. {$ifndef beos}
  180. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  181. {$endif}
  182. {$endif}
  183. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  184. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  185. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  186. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  187. const
  188. iconvctlname='iconvctl';
  189. {$else}
  190. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  191. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  192. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  193. const
  194. iconvctlname='libiconvctl';
  195. {$endif}
  196. var
  197. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  198. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  199. threadvar
  200. iconv_ansi2wide,
  201. iconv_wide2ansi : iconv_t;
  202. { since we cache the iconv_t converters, we have to do the same
  203. for the DefaultSystemCodePage variable since if it changes, we
  204. have to re-initialize the converters too. We can't do that via
  205. a callback in the widestring manager because DefaultSystemCodePage
  206. is not a threadvar and we can't automatically change this in all
  207. threads }
  208. current_DefaultSystemCodePage: TSystemCodePage;
  209. procedure InitThread;
  210. var
  211. transliterate: cint;
  212. iconvindex: longint;
  213. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  214. iconvname: rawbytestring;
  215. {$endif}
  216. begin
  217. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  218. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  219. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  220. if iconvindex<>-1 then
  221. iconvname:=UnixCpMap[iconvindex].name
  222. else
  223. { default to UTF-8 on Unix platforms }
  224. iconvname:='UTF-8';
  225. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  226. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  227. {$else}
  228. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  229. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  230. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  231. {$endif}
  232. if assigned(iconvctl) and
  233. (iconv_wide2ansi<>iconv_t(-1)) then
  234. begin
  235. transliterate:=1;
  236. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  237. end;
  238. end;
  239. procedure FiniThread;
  240. begin
  241. if (iconv_wide2ansi <> iconv_t(-1)) then
  242. iconv_close(iconv_wide2ansi);
  243. if (iconv_ansi2wide <> iconv_t(-1)) then
  244. iconv_close(iconv_ansi2wide);
  245. end;
  246. {$if defined(beos) and not defined(haiku)}
  247. function nl_langinfo(__item:nl_item):pchar;
  248. begin
  249. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  250. // Now implement the minimum required to correctly initialize WideString support
  251. case __item of
  252. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  253. else
  254. begin
  255. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  256. Result := '';
  257. end;
  258. end;
  259. end;
  260. {$endif}
  261. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  262. var
  263. iconvindex: longint;
  264. begin
  265. { TODO: add caching (then we also don't need separate code for
  266. the default system page and other ones)
  267. -- typecasting an ansistring function result to pchar is
  268. unsafe normally, but these are constant strings -> no
  269. problem }
  270. open_iconv_for_cps:=iconv_t(-1);
  271. iconvindex:=GetCodepageData(cp);
  272. if iconvindex=-1 then
  273. exit;
  274. repeat
  275. if cp_is_from then
  276. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  277. else
  278. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  279. inc(iconvindex);
  280. until (open_iconv_for_cps<>iconv_t(-1)) or
  281. (iconvindex>high(UnixCpMap)) or
  282. (UnixCpMap[iconvindex].cp<>cp);
  283. end;
  284. {$ifdef aix}
  285. {$i cwstraix.inc}
  286. {$endif aix}
  287. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  288. var
  289. outlength,
  290. outoffset,
  291. srclen,
  292. outleft : size_t;
  293. use_iconv: iconv_t;
  294. srcpos : pwidechar;
  295. destpos: pchar;
  296. mynil : pchar;
  297. my0 : size_t;
  298. err : longint;
  299. transliterate: cint;
  300. free_iconv: boolean;
  301. {$ifdef aix}
  302. intermediate: rawbytestring;
  303. {$endif aix}
  304. begin
  305. {$ifdef aix}
  306. { AIX libiconv does not support converting cp866 to anything else except
  307. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  308. if cp=866 then
  309. begin
  310. Wide2AnsiMove(source,intermediate,28595,len);
  311. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  312. exit;
  313. end;
  314. {$endif aix}
  315. if (cp=DefaultSystemCodePage) then
  316. begin
  317. { update iconv converter in case the DefaultSystemCodePage has been
  318. changed }
  319. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  320. begin
  321. FiniThread;
  322. InitThread;
  323. end;
  324. use_iconv:=iconv_wide2ansi;
  325. free_iconv:=false;
  326. end
  327. else
  328. begin
  329. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  330. if (use_iconv<>iconv_t(-1)) and
  331. assigned(iconvctl) then
  332. begin
  333. transliterate:=1;
  334. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  335. end;
  336. free_iconv:=true;
  337. end;
  338. { unsupported encoding -> default move }
  339. if use_iconv=iconv_t(-1) then
  340. begin
  341. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  342. exit;
  343. end;
  344. mynil:=nil;
  345. my0:=0;
  346. { rought estimation }
  347. setlength(dest,len*3);
  348. outlength:=len*3;
  349. srclen:=len*2;
  350. srcpos:=source;
  351. destpos:=pchar(dest);
  352. outleft:=outlength;
  353. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  354. begin
  355. err:=fpgetCerrno;
  356. case err of
  357. { last character is incomplete sequence }
  358. ESysEINVAL,
  359. { incomplete sequence in the middle }
  360. ESysEILSEQ:
  361. begin
  362. { skip and set to '?' }
  363. inc(srcpos);
  364. dec(srclen,2);
  365. destpos^:='?';
  366. inc(destpos);
  367. dec(outleft);
  368. { reset }
  369. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  370. if err=ESysEINVAL then
  371. break;
  372. end;
  373. ESysE2BIG:
  374. begin
  375. outoffset:=destpos-pchar(dest);
  376. { extend }
  377. setlength(dest,outlength+len*3);
  378. inc(outleft,len*3);
  379. inc(outlength,len*3);
  380. { string could have been moved }
  381. destpos:=pchar(dest)+outoffset;
  382. end;
  383. else
  384. runerror(231);
  385. end;
  386. end;
  387. // truncate string
  388. setlength(dest,length(dest)-outleft);
  389. SetCodePage(dest,cp,false);
  390. if free_iconv then
  391. iconv_close(use_iconv);
  392. end;
  393. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  394. var
  395. outlength,
  396. outoffset,
  397. outleft : size_t;
  398. use_iconv: iconv_t;
  399. srcpos,
  400. destpos: pchar;
  401. mynil : pchar;
  402. my0 : size_t;
  403. err: cint;
  404. iconvindex: longint;
  405. free_iconv: boolean;
  406. {$ifdef aix}
  407. intermediate: rawbytestring;
  408. {$endif aix}
  409. begin
  410. {$ifdef aix}
  411. { AIX libiconv does not support converting cp866 to anything else except
  412. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  413. if cp=866 then
  414. begin
  415. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  416. source:=pchar(intermediate);
  417. end;
  418. {$endif aix}
  419. if (cp=DefaultSystemCodePage) then
  420. begin
  421. { update iconv converter in case the DefaultSystemCodePage has been
  422. changed }
  423. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  424. begin
  425. FiniThread;
  426. InitThread;
  427. end;
  428. use_iconv:=iconv_ansi2wide;
  429. free_iconv:=false;
  430. end
  431. else
  432. begin
  433. { TODO: add caching (then we also don't need separate code for
  434. the default system page and other ones)
  435. -- typecasting an ansistring function result to pchar is
  436. unsafe normally, but these are constant strings -> no
  437. problem }
  438. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  439. free_iconv:=true;
  440. end;
  441. { unsupported encoding -> default move }
  442. if use_iconv=iconv_t(-1) then
  443. begin
  444. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  445. exit;
  446. end;
  447. mynil:=nil;
  448. my0:=0;
  449. // extra space
  450. outlength:=len+1;
  451. setlength(dest,outlength);
  452. srcpos:=source;
  453. destpos:=pchar(dest);
  454. outleft:=outlength*2;
  455. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  456. begin
  457. err:=fpgetCerrno;
  458. case err of
  459. ESysEINVAL,
  460. ESysEILSEQ:
  461. begin
  462. { skip and set to '?' }
  463. inc(srcpos);
  464. dec(len);
  465. pwidechar(destpos)^:='?';
  466. inc(destpos,2);
  467. dec(outleft,2);
  468. { reset }
  469. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  470. if err=ESysEINVAL then
  471. break;
  472. end;
  473. ESysE2BIG:
  474. begin
  475. outoffset:=destpos-pchar(dest);
  476. { extend }
  477. setlength(dest,outlength+len);
  478. inc(outleft,len*2);
  479. inc(outlength,len);
  480. { string could have been moved }
  481. destpos:=pchar(dest)+outoffset;
  482. end;
  483. else
  484. runerror(231);
  485. end;
  486. end;
  487. // truncate string
  488. setlength(dest,length(dest)-outleft div 2);
  489. if free_iconv then
  490. iconv_close(use_iconv);
  491. end;
  492. function LowerWideString(const s : WideString) : WideString;
  493. var
  494. i : SizeInt;
  495. begin
  496. SetLength(result,length(s));
  497. for i:=0 to length(s)-1 do
  498. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  499. end;
  500. function UpperWideString(const s : WideString) : WideString;
  501. var
  502. i : SizeInt;
  503. begin
  504. SetLength(result,length(s));
  505. for i:=0 to length(s)-1 do
  506. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  507. end;
  508. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  509. begin
  510. if (len>length(s)) then
  511. if (length(s) < 10*256) then
  512. setlength(s,length(s)+10)
  513. else
  514. setlength(s,length(s)+length(s) shr 8);
  515. end;
  516. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  517. begin
  518. EnsureAnsiLen(s,index);
  519. pchar(@s[index])^:=c;
  520. inc(index);
  521. end;
  522. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  523. {$if not(defined(beos) and not defined(haiku))}
  524. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  525. {$else not beos}
  526. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  527. {$endif beos}
  528. var
  529. p : pchar;
  530. mblen : size_t;
  531. begin
  532. { we know that s is unique -> avoid uniquestring calls}
  533. p:=@s[index];
  534. if (nc<=127) then
  535. ConcatCharToAnsiStr(char(nc),s,index)
  536. else
  537. begin
  538. EnsureAnsiLen(s,index+MB_CUR_MAX);
  539. {$if not(defined(beos) and not defined(haiku))}
  540. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  541. {$else not beos}
  542. mblen:=wctomb(p,wchar_t(nc));
  543. {$endif not beos}
  544. if (mblen<>size_t(-1)) then
  545. inc(index,mblen)
  546. else
  547. begin
  548. { invalid wide char }
  549. p^:='?';
  550. inc(index);
  551. end;
  552. end;
  553. end;
  554. function LowerAnsiString(const s : AnsiString) : AnsiString;
  555. var
  556. i, slen,
  557. resindex : SizeInt;
  558. mblen : size_t;
  559. {$if not(defined(beos) and not defined(haiku))}
  560. ombstate,
  561. nmbstate : mbstate_t;
  562. {$endif beos}
  563. wc : wchar_t;
  564. begin
  565. {$if not(defined(beos) and not defined(haiku))}
  566. fillchar(ombstate,sizeof(ombstate),0);
  567. fillchar(nmbstate,sizeof(nmbstate),0);
  568. {$endif beos}
  569. slen:=length(s);
  570. SetLength(result,slen+10);
  571. i:=1;
  572. resindex:=1;
  573. while (i<=slen) do
  574. begin
  575. if (s[i]<=#127) then
  576. begin
  577. wc:=wchar_t(s[i]);
  578. mblen:= 1;
  579. end
  580. else
  581. {$if not(defined(beos) and not defined(haiku))}
  582. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  583. {$else not beos}
  584. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  585. {$endif not beos}
  586. case mblen of
  587. size_t(-2):
  588. begin
  589. { partial invalid character, copy literally }
  590. while (i<=slen) do
  591. begin
  592. ConcatCharToAnsiStr(s[i],result,resindex);
  593. inc(i);
  594. end;
  595. end;
  596. size_t(-1), 0:
  597. begin
  598. { invalid or null character }
  599. ConcatCharToAnsiStr(s[i],result,resindex);
  600. inc(i);
  601. end;
  602. else
  603. begin
  604. { a valid sequence }
  605. { even if mblen = 1, the lowercase version may have a }
  606. { different length }
  607. { We can't do anything special if wchar_t is 16 bit... }
  608. {$if not(defined(beos) and not defined(haiku))}
  609. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  610. {$else not beos}
  611. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  612. {$endif not beos}
  613. inc(i,mblen);
  614. end;
  615. end;
  616. end;
  617. SetLength(result,resindex-1);
  618. end;
  619. function UpperAnsiString(const s : AnsiString) : AnsiString;
  620. var
  621. i, slen,
  622. resindex : SizeInt;
  623. mblen : size_t;
  624. {$if not(defined(beos) and not defined(haiku))}
  625. ombstate,
  626. nmbstate : mbstate_t;
  627. {$endif beos}
  628. wc : wchar_t;
  629. begin
  630. {$if not(defined(beos) and not defined(haiku))}
  631. fillchar(ombstate,sizeof(ombstate),0);
  632. fillchar(nmbstate,sizeof(nmbstate),0);
  633. {$endif beos}
  634. slen:=length(s);
  635. SetLength(result,slen+10);
  636. i:=1;
  637. resindex:=1;
  638. while (i<=slen) do
  639. begin
  640. if (s[i]<=#127) then
  641. begin
  642. wc:=wchar_t(s[i]);
  643. mblen:= 1;
  644. end
  645. else
  646. {$if not(defined(beos) and not defined(haiku))}
  647. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  648. {$else not beos}
  649. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  650. {$endif beos}
  651. case mblen of
  652. size_t(-2):
  653. begin
  654. { partial invalid character, copy literally }
  655. while (i<=slen) do
  656. begin
  657. ConcatCharToAnsiStr(s[i],result,resindex);
  658. inc(i);
  659. end;
  660. end;
  661. size_t(-1), 0:
  662. begin
  663. { invalid or null character }
  664. ConcatCharToAnsiStr(s[i],result,resindex);
  665. inc(i);
  666. end;
  667. else
  668. begin
  669. { a valid sequence }
  670. { even if mblen = 1, the uppercase version may have a }
  671. { different length }
  672. { We can't do anything special if wchar_t is 16 bit... }
  673. {$if not(defined(beos) and not defined(haiku))}
  674. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  675. {$else not beos}
  676. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  677. {$endif not beos}
  678. inc(i,mblen);
  679. end;
  680. end;
  681. end;
  682. SetLength(result,resindex-1);
  683. end;
  684. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  685. var
  686. i, slen,
  687. destindex : SizeInt;
  688. uch : UCS4Char;
  689. begin
  690. slen:=length(s);
  691. setlength(result,slen+1);
  692. i:=1;
  693. destindex:=0;
  694. while (i<=slen) do
  695. begin
  696. uch:=UCS4Char(s[i]);
  697. if (uch=0) then
  698. result[destindex]:=32
  699. else if (uch<=$d7ff) or (uch>=$e000) then
  700. result[destindex]:=uch
  701. else if (uch<=$dbff) and
  702. (i<slen) and
  703. (s[i+1]>=#$dc00) and
  704. (s[i+1]<=#$dfff) then
  705. begin
  706. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  707. inc(i);
  708. end
  709. else { invalid surrogate pair }
  710. result[destindex]:=uch;
  711. inc(i);
  712. inc(destindex);
  713. end;
  714. result[destindex]:=UCS4Char(0);
  715. { Trimming length in this particular case is just a waste of time,
  716. because result will be interpreted as null-terminated and discarded
  717. almost immediately }
  718. end;
  719. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  720. {$if not(defined (aix) and defined(cpupowerpc32))}
  721. var
  722. hs1,hs2 : UCS4String;
  723. us1,us2 : WideString;
  724. begin
  725. { wcscoll interprets null chars as end-of-string -> filter out }
  726. if coIgnoreCase in Options then
  727. begin
  728. us1:=UpperWideString(s1);
  729. us2:=UpperWideString(s2);
  730. end
  731. else
  732. begin
  733. us1:=s1;
  734. us2:=s2;
  735. end;
  736. hs1:=WideStringToUCS4StringNoNulls(us1);
  737. hs2:=WideStringToUCS4StringNoNulls(us2);
  738. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  739. end;
  740. {$else}
  741. { AIX/PPC32 has a 16 bit wchar_t }
  742. var
  743. i, len: longint;
  744. us1,us2 : WideString;
  745. hs1, hs2: array of widechar;
  746. begin
  747. if coIgnoreCase in Options then
  748. begin
  749. us1:=UpperWideString(s1);
  750. us2:=UpperWideString(s2);
  751. end
  752. else
  753. begin
  754. us1:=s1;
  755. us2:=s2;
  756. end;
  757. len:=length(us1);
  758. setlength(hs1,len+1);
  759. for i:=1 to len do
  760. if us1[i]<>#0 then
  761. hs1[i-1]:=us1[i]
  762. else
  763. hs1[i-1]:=#32;
  764. hs1[len]:=#0;
  765. len:=length(us2);
  766. setlength(hs2,len+1);
  767. for i:=1 to len do
  768. if us2[i]<>#0 then
  769. hs2[i-1]:=us2[i]
  770. else
  771. hs2[i-1]:=#32;
  772. hs2[len]:=#0;
  773. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  774. end;
  775. {$endif}
  776. { return value: number of code points in the string. Whenever an invalid
  777. code point is encountered, all characters part of this invalid code point
  778. are considered to form one "character" and the next character is
  779. considered to be the start of a new (possibly also invalid) code point }
  780. function CharLengthPChar(const Str: PChar): PtrInt;
  781. var
  782. nextlen: ptrint;
  783. s: pchar;
  784. {$if not(defined(beos) and not defined(haiku))}
  785. mbstate: mbstate_t;
  786. {$endif not beos}
  787. begin
  788. result:=0;
  789. s:=str;
  790. {$if not(defined(beos) and not defined(haiku))}
  791. fillchar(mbstate,sizeof(mbstate),0);
  792. {$endif not beos}
  793. repeat
  794. {$if defined(beos) and not defined(haiku)}
  795. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  796. {$else beos}
  797. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  798. {$endif beos}
  799. { skip invalid/incomplete sequences }
  800. if (nextlen<0) then
  801. nextlen:=1;
  802. inc(result,1);
  803. inc(s,nextlen);
  804. until (nextlen=0);
  805. end;
  806. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  807. {$if not(defined(beos) and not defined(haiku))}
  808. var
  809. mbstate: mbstate_t;
  810. {$endif not beos}
  811. begin
  812. {$if defined(beos) and not defined(haiku)}
  813. result:=ptrint(mblen(str,maxlookahead));
  814. {$else beos}
  815. fillchar(mbstate,sizeof(mbstate),0);
  816. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  817. { mbrlen can also return -2 for "incomplete but potially valid character
  818. and data has been processed" }
  819. if result<0 then
  820. result:=-1;
  821. {$endif beos}
  822. end;
  823. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  824. var
  825. a,b: pchar;
  826. i: PtrInt;
  827. begin
  828. if not(canmodifys1) then
  829. getmem(a,len1+1)
  830. else
  831. a:=s1;
  832. for i:=0 to len1-1 do
  833. if s1[i]<>#0 then
  834. a[i]:=s1[i]
  835. else
  836. a[i]:=#32;
  837. a[len1]:=#0;
  838. if not(canmodifys2) then
  839. getmem(b,len2+1)
  840. else
  841. b:=s2;
  842. for i:=0 to len2-1 do
  843. if s2[i]<>#0 then
  844. b[i]:=s2[i]
  845. else
  846. b[i]:=#32;
  847. b[len2]:=#0;
  848. result:=strcoll(a,b);
  849. if not(canmodifys1) then
  850. freemem(a);
  851. if not(canmodifys2) then
  852. freemem(b);
  853. end;
  854. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  855. begin
  856. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  857. end;
  858. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  859. begin
  860. result:=strcoll(s1,s2);
  861. end;
  862. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  863. var
  864. a, b: AnsiString;
  865. begin
  866. a:=UpperAnsistring(s1);
  867. b:=UpperAnsistring(s2);
  868. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  869. end;
  870. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  871. begin
  872. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  873. end;
  874. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  875. var
  876. a, b: pchar;
  877. begin
  878. if (maxlen=0) then
  879. exit(0);
  880. if (s1[maxlen]<>#0) then
  881. begin
  882. getmem(a,maxlen+1);
  883. move(s1^,a^,maxlen);
  884. a[maxlen]:=#0;
  885. end
  886. else
  887. a:=s1;
  888. if (s2[maxlen]<>#0) then
  889. begin
  890. getmem(b,maxlen+1);
  891. move(s2^,b^,maxlen);
  892. b[maxlen]:=#0;
  893. end
  894. else
  895. b:=s2;
  896. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  897. if (a<>s1) then
  898. freemem(a);
  899. if (b<>s2) then
  900. freemem(b);
  901. end;
  902. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  903. var
  904. a, b: ansistring;
  905. begin
  906. if (maxlen=0) then
  907. exit(0);
  908. setlength(a,maxlen);
  909. move(s1^,a[1],maxlen);
  910. setlength(b,maxlen);
  911. move(s2^,b[1],maxlen);
  912. result:=AnsiCompareText(a,b);
  913. end;
  914. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  915. var
  916. newlen: sizeint;
  917. begin
  918. newlen:=length(s);
  919. if newlen>strlen(orgp) then
  920. fpc_rangeerror;
  921. p:=orgp;
  922. if (newlen>0) then
  923. move(s[1],p[0],newlen);
  924. p[newlen]:=#0;
  925. end;
  926. function AnsiStrLower(Str: PChar): PChar;
  927. var
  928. temp: ansistring;
  929. begin
  930. temp:=loweransistring(str);
  931. ansi2pchar(temp,str,result);
  932. end;
  933. function AnsiStrUpper(Str: PChar): PChar;
  934. var
  935. temp: ansistring;
  936. begin
  937. temp:=upperansistring(str);
  938. ansi2pchar(temp,str,result);
  939. end;
  940. function envvarset(const varname: pchar): boolean;
  941. var
  942. varval: pchar;
  943. begin
  944. varval:=fpgetenv(varname);
  945. result:=
  946. assigned(varval) and
  947. (varval[0]<>#0);
  948. end;
  949. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  950. var
  951. langinfo: pchar;
  952. begin
  953. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  954. if stdcp=scpFileSystemSingleByte then
  955. begin
  956. result:=CP_UTF8;
  957. exit;
  958. end;
  959. {$endif}
  960. { if none of the relevant LC_* environment variables are set, fall back to
  961. UTF-8 (this happens under some versions of OS X for GUI applications, which
  962. otherwise get CP_ASCII) }
  963. if envvarset('LC_ALL') or
  964. envvarset('LC_CTYPE') or
  965. envvarset('LANG') then
  966. begin
  967. langinfo:=nl_langinfo(CODESET);
  968. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  969. that causes it to return an empty string of UTF-8 locales
  970. -> patch up (and in general, UTF-8 is a good default on
  971. Unix platforms) }
  972. if not assigned(langinfo) or
  973. (langinfo^=#0) then
  974. langinfo:='UTF-8';
  975. Result:=GetCodepageByName(ansistring(langinfo));
  976. end
  977. else
  978. Result:=unixcp.GetSystemCodepage;
  979. end;
  980. {$ifdef FPC_HAS_CPSTRING}
  981. procedure SetStdIOCodePage(var T: Text); inline;
  982. begin
  983. case TextRec(T).Mode of
  984. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  985. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  986. end;
  987. end;
  988. procedure SetStdIOCodePages; inline;
  989. begin
  990. SetStdIOCodePage(Input);
  991. SetStdIOCodePage(Output);
  992. SetStdIOCodePage(ErrOutput);
  993. SetStdIOCodePage(StdOut);
  994. SetStdIOCodePage(StdErr);
  995. end;
  996. {$endif FPC_HAS_CPSTRING}
  997. var
  998. OrgWideStringManager: TUnicodeStringManager;
  999. Procedure SetCWideStringManager;
  1000. Var
  1001. CWideStringManager : TUnicodeStringManager;
  1002. begin
  1003. GetUnicodeStringManager(OrgWideStringManager);
  1004. CWideStringManager:=OrgWideStringManager;
  1005. With CWideStringManager do
  1006. begin
  1007. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1008. Ansi2WideMoveProc:=@Ansi2WideMove;
  1009. UpperWideStringProc:=@UpperWideString;
  1010. LowerWideStringProc:=@LowerWideString;
  1011. CompareWideStringProc:=@CompareWideString;
  1012. // CompareTextWideStringProc:=@CompareTextWideString;
  1013. CharLengthPCharProc:=@CharLengthPChar;
  1014. CodePointLengthProc:=@CodePointLength;
  1015. UpperAnsiStringProc:=@UpperAnsiString;
  1016. LowerAnsiStringProc:=@LowerAnsiString;
  1017. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1018. CompareTextAnsiStringProc:=@AnsiCompareText;
  1019. StrCompAnsiStringProc:=@StrCompAnsi;
  1020. StrICompAnsiStringProc:=@AnsiStrIComp;
  1021. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1022. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1023. StrLowerAnsiStringProc:=@AnsiStrLower;
  1024. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1025. ThreadInitProc:=@InitThread;
  1026. ThreadFiniProc:=@FiniThread;
  1027. { Unicode }
  1028. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1029. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1030. UpperUnicodeStringProc:=@UpperWideString;
  1031. LowerUnicodeStringProc:=@LowerWideString;
  1032. CompareUnicodeStringProc:=@CompareWideString;
  1033. { CodePage }
  1034. GetStandardCodePageProc:=@GetStandardCodePage;
  1035. end;
  1036. SetUnicodeStringManager(CWideStringManager);
  1037. end;
  1038. var
  1039. iconvlib:TLibHandle;
  1040. initialization
  1041. SetCWideStringManager;
  1042. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1043. { with the information from the environment variables according to POSIX }
  1044. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1045. setlocale(LC_ALL,'');
  1046. { load iconvctl function }
  1047. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1048. if iconvlib<>0 then
  1049. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1050. { set the DefaultSystemCodePage }
  1051. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1052. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1053. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1054. {$ifdef FPC_HAS_CPSTRING}
  1055. SetStdIOCodePages;
  1056. {$endif FPC_HAS_CPSTRING}
  1057. { init conversion tables for main program }
  1058. InitThread;
  1059. finalization
  1060. { fini conversion tables for main program }
  1061. FiniThread;
  1062. { unload iconv library }
  1063. if iconvlib<>0 then
  1064. FreeLibrary(iconvlib);
  1065. { restore previous (probably default) widestring manager so that subsequent calls
  1066. into the widestring manager won't trigger the finalized functionality }
  1067. SetWideStringManager(OrgWideStringManager);
  1068. end.