cwstring.pp 29 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(haiku)}
  22. {$linklib textencoding}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc,
  34. dynlibs,
  35. unixcp;
  36. Const
  37. {$ifndef useiconv}
  38. libiconvname='c'; // is in libc under Linux.
  39. libprefix='lib';
  40. {$else}
  41. {$ifdef haiku}
  42. libiconvname='textencoding'; // is in libtextencoding under Haiku
  43. libprefix='lib';
  44. {$else}
  45. {$if defined(darwin) or defined(aix)}
  46. libiconvname='libiconv';
  47. libprefix='';
  48. {$else}
  49. libiconvname='iconv';
  50. libprefix='lib';
  51. {$endif}
  52. {$endif}
  53. {$endif}
  54. { helper functions from libc }
  55. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  56. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  57. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  58. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  59. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  60. {$ifndef beos}
  61. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  62. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  63. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  64. {$else beos}
  65. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  66. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  67. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  68. {$endif beos}
  69. const
  70. {$if defined(linux)}
  71. __LC_CTYPE = 0;
  72. LC_ALL = 6;
  73. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  74. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  75. CODESET = _NL_CTYPE_CODESET_NAME;
  76. {$elseif defined(darwin)}
  77. CODESET = 0;
  78. LC_ALL = 0;
  79. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  80. __LC_CTYPE = 0;
  81. LC_ALL = 0;
  82. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  83. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  84. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  85. {$elseif defined(solaris)}
  86. {$define ACCEPT_646}
  87. CODESET=49;
  88. LC_ALL = 6;
  89. {$elseif defined(beos)}
  90. {$warning check correct value for BeOS}
  91. CODESET=49;
  92. {$ifdef haiku}
  93. LC_ALL = 0; // Checked for Haiku
  94. {$else}
  95. LC_ALL = 6; // Checked for BeOS
  96. {$endif}
  97. ESysEILSEQ = EILSEQ;
  98. {$elseif defined(OpenBSD)}
  99. {$define ACCEPT_646}
  100. CODESET = 51;
  101. LC_ALL = 0;
  102. {$elseif defined(NetBSD)}
  103. {$define ACCEPT_646}
  104. CODESET = 51;
  105. LC_ALL = 0;
  106. {$elseif defined(aix)}
  107. CODESET = 49;
  108. LC_ALL = -1;
  109. {$else not aix}
  110. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  111. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  112. {$endif}
  113. { unicode encoding name }
  114. {$ifdef FPC_LITTLE_ENDIAN}
  115. unicode_encoding2 = 'UTF-16LE';
  116. unicode_encoding4 = 'UCS-4LE';
  117. {$else FPC_LITTLE_ENDIAN}
  118. {$ifdef AIX}
  119. unicode_encoding2 = 'UTF-16';
  120. {$else AIX}
  121. unicode_encoding2 = 'UTF-16BE';
  122. unicode_encoding4 = 'UCS-4BE';
  123. {$endif AIX}
  124. {$endif FPC_LITTLE_ENDIAN}
  125. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  126. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  127. { that's a libc macro mapped to internal functions/variables }
  128. { and thus not a stable external API on systems where libc }
  129. { breaks backwards compatibility every now and then }
  130. MB_CUR_MAX = 10;
  131. { Requests for iconvctl }
  132. ICONV_TRIVIALP = 0; // int *argument
  133. ICONV_GET_TRANSLITERATE = 1; // int *argument
  134. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  135. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  136. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  137. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  138. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  139. type
  140. piconv_t = ^iconv_t;
  141. iconv_t = pointer;
  142. nl_item = cint;
  143. {$ifdef haiku}
  144. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  145. {$else}
  146. {$ifndef beos}
  147. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  148. {$endif}
  149. {$endif}
  150. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  151. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  152. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  153. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  154. const
  155. iconvctlname='iconvctl';
  156. {$else}
  157. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  158. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  159. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  160. const
  161. iconvctlname='libiconvctl';
  162. {$endif}
  163. var
  164. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  165. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  166. threadvar
  167. iconv_ansi2wide,
  168. iconv_wide2ansi : iconv_t;
  169. { since we cache the iconv_t converters, we have to do the same
  170. for the DefaultSystemCodePage variable since if it changes, we
  171. have to re-initialize the converters too. We can't do that via
  172. a callback in the widestring manager because DefaultSystemCodePage
  173. is not a threadvar and we can't automatically change this in all
  174. threads }
  175. current_DefaultSystemCodePage: TSystemCodePage;
  176. procedure InitThread;
  177. var
  178. transliterate: cint;
  179. iconvindex: longint;
  180. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  181. iconvname: rawbytestring;
  182. {$endif}
  183. begin
  184. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  185. {$if not(defined(darwin) and defined(cpuarm)) and not defined(iphonesim)}
  186. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  187. if iconvindex<>-1 then
  188. iconvname:=UnixCpMap[iconvindex].name
  189. else
  190. { default to UTF-8 on Unix platforms }
  191. iconvname:='UTF-8';
  192. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  193. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  194. {$else}
  195. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  196. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  197. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  198. {$endif}
  199. if assigned(iconvctl) and
  200. (iconv_wide2ansi<>iconv_t(-1)) then
  201. begin
  202. transliterate:=1;
  203. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  204. end;
  205. end;
  206. procedure FiniThread;
  207. begin
  208. if (iconv_wide2ansi <> iconv_t(-1)) then
  209. iconv_close(iconv_wide2ansi);
  210. if (iconv_ansi2wide <> iconv_t(-1)) then
  211. iconv_close(iconv_ansi2wide);
  212. end;
  213. {$if defined(beos) and not defined(haiku)}
  214. function nl_langinfo(__item:nl_item):pchar;
  215. begin
  216. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  217. // Now implement the minimum required to correctly initialize WideString support
  218. case __item of
  219. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  220. else
  221. begin
  222. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  223. Result := '';
  224. end;
  225. end;
  226. end;
  227. {$endif}
  228. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  229. var
  230. iconvindex: longint;
  231. begin
  232. { TODO: add caching (then we also don't need separate code for
  233. the default system page and other ones)
  234. -- typecasting an ansistring function result to pchar is
  235. unsafe normally, but these are constant strings -> no
  236. problem }
  237. open_iconv_for_cps:=iconv_t(-1);
  238. iconvindex:=GetCodepageData(cp);
  239. if iconvindex=-1 then
  240. exit;
  241. repeat
  242. if cp_is_from then
  243. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  244. else
  245. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  246. inc(iconvindex);
  247. until (open_iconv_for_cps<>iconv_t(-1)) or
  248. (iconvindex>high(UnixCpMap)) or
  249. (UnixCpMap[iconvindex].cp<>cp);
  250. end;
  251. {$ifdef aix}
  252. {$i cwstraix.inc}
  253. {$endif aix}
  254. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  255. var
  256. outlength,
  257. outoffset,
  258. srclen,
  259. outleft : size_t;
  260. use_iconv: iconv_t;
  261. srcpos : pwidechar;
  262. destpos: pchar;
  263. mynil : pchar;
  264. my0 : size_t;
  265. err : longint;
  266. transliterate: cint;
  267. free_iconv: boolean;
  268. {$ifdef aix}
  269. intermediate: rawbytestring;
  270. {$endif aix}
  271. begin
  272. {$ifdef aix}
  273. { AIX libiconv does not support converting cp866 to anything else except
  274. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  275. if cp=866 then
  276. begin
  277. Wide2AnsiMove(source,intermediate,28595,len);
  278. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  279. exit;
  280. end;
  281. {$endif aix}
  282. if (cp=DefaultSystemCodePage) then
  283. begin
  284. { update iconv converter in case the DefaultSystemCodePage has been
  285. changed }
  286. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  287. begin
  288. FiniThread;
  289. InitThread;
  290. end;
  291. use_iconv:=iconv_wide2ansi;
  292. free_iconv:=false;
  293. end
  294. else
  295. begin
  296. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  297. if (use_iconv<>iconv_t(-1)) and
  298. assigned(iconvctl) then
  299. begin
  300. transliterate:=1;
  301. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  302. end;
  303. free_iconv:=true;
  304. end;
  305. { unsupported encoding -> default move }
  306. if use_iconv=iconv_t(-1) then
  307. begin
  308. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  309. exit;
  310. end;
  311. mynil:=nil;
  312. my0:=0;
  313. { rought estimation }
  314. setlength(dest,len*3);
  315. outlength:=len*3;
  316. srclen:=len*2;
  317. srcpos:=source;
  318. destpos:=pchar(dest);
  319. outleft:=outlength;
  320. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  321. begin
  322. err:=fpgetCerrno;
  323. case err of
  324. { last character is incomplete sequence }
  325. ESysEINVAL,
  326. { incomplete sequence in the middle }
  327. ESysEILSEQ:
  328. begin
  329. { skip and set to '?' }
  330. inc(srcpos);
  331. dec(srclen,2);
  332. destpos^:='?';
  333. inc(destpos);
  334. dec(outleft);
  335. { reset }
  336. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  337. if err=ESysEINVAL then
  338. break;
  339. end;
  340. ESysE2BIG:
  341. begin
  342. outoffset:=destpos-pchar(dest);
  343. { extend }
  344. setlength(dest,outlength+len*3);
  345. inc(outleft,len*3);
  346. inc(outlength,len*3);
  347. { string could have been moved }
  348. destpos:=pchar(dest)+outoffset;
  349. end;
  350. else
  351. runerror(231);
  352. end;
  353. end;
  354. // truncate string
  355. setlength(dest,length(dest)-outleft);
  356. SetCodePage(dest,cp,false);
  357. if free_iconv then
  358. iconv_close(use_iconv);
  359. end;
  360. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  361. var
  362. outlength,
  363. outoffset,
  364. outleft : size_t;
  365. use_iconv: iconv_t;
  366. srcpos,
  367. destpos: pchar;
  368. mynil : pchar;
  369. my0 : size_t;
  370. err: cint;
  371. iconvindex: longint;
  372. free_iconv: boolean;
  373. {$ifdef aix}
  374. intermediate: rawbytestring;
  375. {$endif aix}
  376. begin
  377. {$ifdef aix}
  378. { AIX libiconv does not support converting cp866 to anything else except
  379. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  380. if cp=866 then
  381. begin
  382. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  383. source:=pchar(intermediate);
  384. end;
  385. {$endif aix}
  386. if (cp=DefaultSystemCodePage) then
  387. begin
  388. { update iconv converter in case the DefaultSystemCodePage has been
  389. changed }
  390. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  391. begin
  392. FiniThread;
  393. InitThread;
  394. end;
  395. use_iconv:=iconv_ansi2wide;
  396. free_iconv:=false;
  397. end
  398. else
  399. begin
  400. { TODO: add caching (then we also don't need separate code for
  401. the default system page and other ones)
  402. -- typecasting an ansistring function result to pchar is
  403. unsafe normally, but these are constant strings -> no
  404. problem }
  405. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  406. free_iconv:=true;
  407. end;
  408. { unsupported encoding -> default move }
  409. if use_iconv=iconv_t(-1) then
  410. begin
  411. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  412. exit;
  413. end;
  414. mynil:=nil;
  415. my0:=0;
  416. // extra space
  417. outlength:=len+1;
  418. setlength(dest,outlength);
  419. srcpos:=source;
  420. destpos:=pchar(dest);
  421. outleft:=outlength*2;
  422. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  423. begin
  424. err:=fpgetCerrno;
  425. case err of
  426. ESysEINVAL,
  427. ESysEILSEQ:
  428. begin
  429. { skip and set to '?' }
  430. inc(srcpos);
  431. dec(len);
  432. pwidechar(destpos)^:='?';
  433. inc(destpos,2);
  434. dec(outleft,2);
  435. { reset }
  436. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  437. if err=ESysEINVAL then
  438. break;
  439. end;
  440. ESysE2BIG:
  441. begin
  442. outoffset:=destpos-pchar(dest);
  443. { extend }
  444. setlength(dest,outlength+len);
  445. inc(outleft,len*2);
  446. inc(outlength,len);
  447. { string could have been moved }
  448. destpos:=pchar(dest)+outoffset;
  449. end;
  450. else
  451. runerror(231);
  452. end;
  453. end;
  454. // truncate string
  455. setlength(dest,length(dest)-outleft div 2);
  456. if free_iconv then
  457. iconv_close(use_iconv);
  458. end;
  459. function LowerWideString(const s : WideString) : WideString;
  460. var
  461. i : SizeInt;
  462. begin
  463. SetLength(result,length(s));
  464. for i:=0 to length(s)-1 do
  465. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  466. end;
  467. function UpperWideString(const s : WideString) : WideString;
  468. var
  469. i : SizeInt;
  470. begin
  471. SetLength(result,length(s));
  472. for i:=0 to length(s)-1 do
  473. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  474. end;
  475. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  476. begin
  477. if (len>length(s)) then
  478. if (length(s) < 10*256) then
  479. setlength(s,length(s)+10)
  480. else
  481. setlength(s,length(s)+length(s) shr 8);
  482. end;
  483. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  484. begin
  485. EnsureAnsiLen(s,index);
  486. pchar(@s[index])^:=c;
  487. inc(index);
  488. end;
  489. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  490. {$ifndef beos}
  491. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  492. {$else not beos}
  493. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  494. {$endif beos}
  495. var
  496. p : pchar;
  497. mblen : size_t;
  498. begin
  499. { we know that s is unique -> avoid uniquestring calls}
  500. p:=@s[index];
  501. if (nc<=127) then
  502. ConcatCharToAnsiStr(char(nc),s,index)
  503. else
  504. begin
  505. EnsureAnsiLen(s,index+MB_CUR_MAX);
  506. {$ifndef beos}
  507. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  508. {$else not beos}
  509. mblen:=wctomb(p,wchar_t(nc));
  510. {$endif not beos}
  511. if (mblen<>size_t(-1)) then
  512. inc(index,mblen)
  513. else
  514. begin
  515. { invalid wide char }
  516. p^:='?';
  517. inc(index);
  518. end;
  519. end;
  520. end;
  521. function LowerAnsiString(const s : AnsiString) : AnsiString;
  522. var
  523. i, slen,
  524. resindex : SizeInt;
  525. mblen : size_t;
  526. {$ifndef beos}
  527. ombstate,
  528. nmbstate : mbstate_t;
  529. {$endif beos}
  530. wc : wchar_t;
  531. begin
  532. {$ifndef beos}
  533. fillchar(ombstate,sizeof(ombstate),0);
  534. fillchar(nmbstate,sizeof(nmbstate),0);
  535. {$endif beos}
  536. slen:=length(s);
  537. SetLength(result,slen+10);
  538. i:=1;
  539. resindex:=1;
  540. while (i<=slen) do
  541. begin
  542. if (s[i]<=#127) then
  543. begin
  544. wc:=wchar_t(s[i]);
  545. mblen:= 1;
  546. end
  547. else
  548. {$ifndef beos}
  549. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  550. {$else not beos}
  551. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  552. {$endif not beos}
  553. case mblen of
  554. size_t(-2):
  555. begin
  556. { partial invalid character, copy literally }
  557. while (i<=slen) do
  558. begin
  559. ConcatCharToAnsiStr(s[i],result,resindex);
  560. inc(i);
  561. end;
  562. end;
  563. size_t(-1), 0:
  564. begin
  565. { invalid or null character }
  566. ConcatCharToAnsiStr(s[i],result,resindex);
  567. inc(i);
  568. end;
  569. else
  570. begin
  571. { a valid sequence }
  572. { even if mblen = 1, the lowercase version may have a }
  573. { different length }
  574. { We can't do anything special if wchar_t is 16 bit... }
  575. {$ifndef beos}
  576. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  577. {$else not beos}
  578. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  579. {$endif not beos}
  580. inc(i,mblen);
  581. end;
  582. end;
  583. end;
  584. SetLength(result,resindex-1);
  585. end;
  586. function UpperAnsiString(const s : AnsiString) : AnsiString;
  587. var
  588. i, slen,
  589. resindex : SizeInt;
  590. mblen : size_t;
  591. {$ifndef beos}
  592. ombstate,
  593. nmbstate : mbstate_t;
  594. {$endif beos}
  595. wc : wchar_t;
  596. begin
  597. {$ifndef beos}
  598. fillchar(ombstate,sizeof(ombstate),0);
  599. fillchar(nmbstate,sizeof(nmbstate),0);
  600. {$endif beos}
  601. slen:=length(s);
  602. SetLength(result,slen+10);
  603. i:=1;
  604. resindex:=1;
  605. while (i<=slen) do
  606. begin
  607. if (s[i]<=#127) then
  608. begin
  609. wc:=wchar_t(s[i]);
  610. mblen:= 1;
  611. end
  612. else
  613. {$ifndef beos}
  614. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  615. {$else not beos}
  616. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  617. {$endif beos}
  618. case mblen of
  619. size_t(-2):
  620. begin
  621. { partial invalid character, copy literally }
  622. while (i<=slen) do
  623. begin
  624. ConcatCharToAnsiStr(s[i],result,resindex);
  625. inc(i);
  626. end;
  627. end;
  628. size_t(-1), 0:
  629. begin
  630. { invalid or null character }
  631. ConcatCharToAnsiStr(s[i],result,resindex);
  632. inc(i);
  633. end;
  634. else
  635. begin
  636. { a valid sequence }
  637. { even if mblen = 1, the uppercase version may have a }
  638. { different length }
  639. { We can't do anything special if wchar_t is 16 bit... }
  640. {$ifndef beos}
  641. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  642. {$else not beos}
  643. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  644. {$endif not beos}
  645. inc(i,mblen);
  646. end;
  647. end;
  648. end;
  649. SetLength(result,resindex-1);
  650. end;
  651. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  652. var
  653. i, slen,
  654. destindex : SizeInt;
  655. uch : UCS4Char;
  656. begin
  657. slen:=length(s);
  658. setlength(result,slen+1);
  659. i:=1;
  660. destindex:=0;
  661. while (i<=slen) do
  662. begin
  663. uch:=UCS4Char(s[i]);
  664. if (uch=0) then
  665. result[destindex]:=32
  666. else if (uch<=$d7ff) or (uch>=$e000) then
  667. result[destindex]:=uch
  668. else if (uch<=$dbff) and
  669. (i<slen) and
  670. (s[i+1]>=#$dc00) and
  671. (s[i+1]<=#$dfff) then
  672. begin
  673. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  674. inc(i);
  675. end
  676. else { invalid surrogate pair }
  677. result[destindex]:=uch;
  678. inc(i);
  679. inc(destindex);
  680. end;
  681. result[destindex]:=UCS4Char(0);
  682. { Trimming length in this particular case is just a waste of time,
  683. because result will be interpreted as null-terminated and discarded
  684. almost immediately }
  685. end;
  686. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  687. {$if not(defined (aix) and defined(cpupowerpc32))}
  688. var
  689. hs1,hs2 : UCS4String;
  690. begin
  691. { wcscoll interprets null chars as end-of-string -> filter out }
  692. hs1:=WideStringToUCS4StringNoNulls(s1);
  693. hs2:=WideStringToUCS4StringNoNulls(s2);
  694. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  695. end;
  696. {$else}
  697. { AIX/PPC32 has a 16 bit wchar_t }
  698. var
  699. i, len: longint;
  700. hs1, hs2: array of widechar;
  701. begin
  702. len:=length(s1);
  703. setlength(hs1,len+1);
  704. for i:=1 to len do
  705. if s1[i]<>#0 then
  706. hs1[i-1]:=s1[i]
  707. else
  708. hs1[i-1]:=#32;
  709. hs1[len]:=#0;
  710. len:=length(s2);
  711. setlength(hs2,len+1);
  712. for i:=1 to len do
  713. if s2[i]<>#0 then
  714. hs2[i-1]:=s2[i]
  715. else
  716. hs2[i-1]:=#32;
  717. hs2[len]:=#0;
  718. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  719. end;
  720. {$endif}
  721. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  722. begin
  723. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  724. end;
  725. function CharLengthPChar(const Str: PChar): PtrInt;
  726. var
  727. nextlen: ptrint;
  728. s: pchar;
  729. {$ifndef beos}
  730. mbstate: mbstate_t;
  731. {$endif not beos}
  732. begin
  733. result:=0;
  734. s:=str;
  735. {$ifndef beos}
  736. fillchar(mbstate,sizeof(mbstate),0);
  737. {$endif not beos}
  738. repeat
  739. {$ifdef beos}
  740. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  741. {$else beos}
  742. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  743. {$endif beos}
  744. { skip invalid/incomplete sequences }
  745. if (nextlen<0) then
  746. nextlen:=1;
  747. inc(result,nextlen);
  748. inc(s,nextlen);
  749. until (nextlen=0);
  750. end;
  751. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  752. {$ifndef beos}
  753. var
  754. mbstate: mbstate_t;
  755. {$endif not beos}
  756. begin
  757. {$ifdef beos}
  758. result:=ptrint(mblen(str,maxlookahead));
  759. {$else beos}
  760. fillchar(mbstate,sizeof(mbstate),0);
  761. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  762. { mbrlen can also return -2 for "incomplete but potially valid character
  763. and data has been processed" }
  764. if result<0 then
  765. result:=-1;
  766. {$endif beos}
  767. end;
  768. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  769. var
  770. a,b: pchar;
  771. i: PtrInt;
  772. begin
  773. if not(canmodifys1) then
  774. getmem(a,len1+1)
  775. else
  776. a:=s1;
  777. for i:=0 to len1-1 do
  778. if s1[i]<>#0 then
  779. a[i]:=s1[i]
  780. else
  781. a[i]:=#32;
  782. a[len1]:=#0;
  783. if not(canmodifys2) then
  784. getmem(b,len2+1)
  785. else
  786. b:=s2;
  787. for i:=0 to len2-1 do
  788. if s2[i]<>#0 then
  789. b[i]:=s2[i]
  790. else
  791. b[i]:=#32;
  792. b[len2]:=#0;
  793. result:=strcoll(a,b);
  794. if not(canmodifys1) then
  795. freemem(a);
  796. if not(canmodifys2) then
  797. freemem(b);
  798. end;
  799. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  800. begin
  801. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  802. end;
  803. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  804. begin
  805. result:=strcoll(s1,s2);
  806. end;
  807. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  808. var
  809. a, b: AnsiString;
  810. begin
  811. a:=UpperAnsistring(s1);
  812. b:=UpperAnsistring(s2);
  813. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  814. end;
  815. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  816. begin
  817. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  818. end;
  819. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  820. var
  821. a, b: pchar;
  822. begin
  823. if (maxlen=0) then
  824. exit(0);
  825. if (s1[maxlen]<>#0) then
  826. begin
  827. getmem(a,maxlen+1);
  828. move(s1^,a^,maxlen);
  829. a[maxlen]:=#0;
  830. end
  831. else
  832. a:=s1;
  833. if (s2[maxlen]<>#0) then
  834. begin
  835. getmem(b,maxlen+1);
  836. move(s2^,b^,maxlen);
  837. b[maxlen]:=#0;
  838. end
  839. else
  840. b:=s2;
  841. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  842. if (a<>s1) then
  843. freemem(a);
  844. if (b<>s2) then
  845. freemem(b);
  846. end;
  847. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  848. var
  849. a, b: ansistring;
  850. begin
  851. if (maxlen=0) then
  852. exit(0);
  853. setlength(a,maxlen);
  854. move(s1^,a[1],maxlen);
  855. setlength(b,maxlen);
  856. move(s2^,b[1],maxlen);
  857. result:=AnsiCompareText(a,b);
  858. end;
  859. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  860. var
  861. newlen: sizeint;
  862. begin
  863. newlen:=length(s);
  864. if newlen>strlen(orgp) then
  865. fpc_rangeerror;
  866. p:=orgp;
  867. if (newlen>0) then
  868. move(s[1],p[0],newlen);
  869. p[newlen]:=#0;
  870. end;
  871. function AnsiStrLower(Str: PChar): PChar;
  872. var
  873. temp: ansistring;
  874. begin
  875. temp:=loweransistring(str);
  876. ansi2pchar(temp,str,result);
  877. end;
  878. function AnsiStrUpper(Str: PChar): PChar;
  879. var
  880. temp: ansistring;
  881. begin
  882. temp:=upperansistring(str);
  883. ansi2pchar(temp,str,result);
  884. end;
  885. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  886. var
  887. langinfo: pchar;
  888. begin
  889. langinfo:=nl_langinfo(CODESET);
  890. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  891. that causes it to return an empty string of UTF-8 locales
  892. -> patch up (and in general, UTF-8 is a good default on
  893. Unix platforms) }
  894. if not assigned(langinfo) or
  895. (langinfo^=#0) then
  896. langinfo:='UTF-8';
  897. Result := GetCodepageByName(ansistring(langinfo));
  898. end;
  899. {$ifdef FPC_HAS_CPSTRING}
  900. {$i textrec.inc}
  901. procedure SetStdIOCodePage(var T: Text); inline;
  902. begin
  903. case TextRec(T).Mode of
  904. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  905. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  906. end;
  907. end;
  908. procedure SetStdIOCodePages; inline;
  909. begin
  910. SetStdIOCodePage(Input);
  911. SetStdIOCodePage(Output);
  912. SetStdIOCodePage(ErrOutput);
  913. SetStdIOCodePage(StdOut);
  914. SetStdIOCodePage(StdErr);
  915. end;
  916. {$endif FPC_HAS_CPSTRING}
  917. Procedure SetCWideStringManager;
  918. Var
  919. CWideStringManager : TUnicodeStringManager;
  920. begin
  921. CWideStringManager:=widestringmanager;
  922. With CWideStringManager do
  923. begin
  924. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  925. Ansi2WideMoveProc:=@Ansi2WideMove;
  926. UpperWideStringProc:=@UpperWideString;
  927. LowerWideStringProc:=@LowerWideString;
  928. CompareWideStringProc:=@CompareWideString;
  929. CompareTextWideStringProc:=@CompareTextWideString;
  930. CharLengthPCharProc:=@CharLengthPChar;
  931. CodePointLengthProc:=@CodePointLength;
  932. UpperAnsiStringProc:=@UpperAnsiString;
  933. LowerAnsiStringProc:=@LowerAnsiString;
  934. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  935. CompareTextAnsiStringProc:=@AnsiCompareText;
  936. StrCompAnsiStringProc:=@StrCompAnsi;
  937. StrICompAnsiStringProc:=@AnsiStrIComp;
  938. StrLCompAnsiStringProc:=@AnsiStrLComp;
  939. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  940. StrLowerAnsiStringProc:=@AnsiStrLower;
  941. StrUpperAnsiStringProc:=@AnsiStrUpper;
  942. ThreadInitProc:=@InitThread;
  943. ThreadFiniProc:=@FiniThread;
  944. { Unicode }
  945. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  946. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  947. UpperUnicodeStringProc:=@UpperWideString;
  948. LowerUnicodeStringProc:=@LowerWideString;
  949. CompareUnicodeStringProc:=@CompareWideString;
  950. CompareTextUnicodeStringProc:=@CompareTextWideString;
  951. { CodePage }
  952. GetStandardCodePageProc:=@GetStandardCodePage;
  953. end;
  954. SetUnicodeStringManager(CWideStringManager);
  955. end;
  956. var
  957. iconvlib:TLibHandle;
  958. initialization
  959. SetCWideStringManager;
  960. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  961. { with the information from the environment variables according to POSIX }
  962. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  963. setlocale(LC_ALL,'');
  964. { load iconvctl function }
  965. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  966. if iconvlib<>0 then
  967. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  968. { set the DefaultSystemCodePage }
  969. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  970. {$ifdef FPC_HAS_CPSTRING}
  971. SetStdIOCodePages;
  972. {$endif FPC_HAS_CPSTRING}
  973. { init conversion tables for main program }
  974. InitThread;
  975. finalization
  976. { fini conversion tables for main program }
  977. FiniThread;
  978. { unload iconv library }
  979. if iconvlib<>0 then
  980. FreeLibrary(iconvlib);
  981. end.