cwstring.pp 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifndef iconv_is_in_libc}
  30. {$if defined(haiku)}
  31. {$linklib textencoding}
  32. {$else}
  33. {$linklib iconv}
  34. {$endif}
  35. {$define useiconv}
  36. {$endif not iconv_is_in_libc}
  37. {$i rtldefs.inc}
  38. Uses
  39. BaseUnix,
  40. ctypes,
  41. unix,
  42. unixtype,
  43. initc,
  44. dynlibs,
  45. unixcp;
  46. Const
  47. {$ifndef useiconv}
  48. libiconvname='c'; // is in libc under Linux.
  49. libprefix='lib';
  50. {$else}
  51. {$ifdef haiku}
  52. libiconvname='textencoding'; // is in libtextencoding under Haiku
  53. libprefix='lib';
  54. {$else}
  55. {$if defined(darwin) or defined(aix)}
  56. libiconvname='libiconv';
  57. libprefix='';
  58. {$else}
  59. libiconvname='iconv';
  60. libprefix='lib';
  61. {$endif}
  62. {$endif}
  63. {$endif}
  64. { helper functions from libc }
  65. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  66. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  67. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  68. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  69. {$ifdef netbsd}
  70. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  71. that should be used }
  72. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale50';
  73. {$else}
  74. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  75. {$endif}
  76. {$if not(defined(beos) and not defined(haiku))}
  77. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  78. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  79. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  80. {$else beos}
  81. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  82. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  83. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  84. {$endif beos}
  85. const
  86. {$if defined(linux)}
  87. __LC_CTYPE = 0;
  88. LC_ALL = 6;
  89. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  90. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  91. CODESET = _NL_CTYPE_CODESET_NAME;
  92. {$elseif defined(darwin)}
  93. CODESET = 0;
  94. LC_ALL = 0;
  95. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  96. __LC_CTYPE = 0;
  97. LC_ALL = 0;
  98. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  99. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  100. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  101. {$elseif defined(solaris)}
  102. {$define ACCEPT_646}
  103. CODESET=49;
  104. LC_ALL = 6;
  105. {$elseif defined(beos)}
  106. {$ifdef haiku}
  107. CODESET= 0; // Checked for Haiku
  108. LC_ALL = 0; // Checked for Haiku
  109. {$else}
  110. {$warning check correct value for BeOS}
  111. CODESET=49;
  112. LC_ALL = 6; // Checked for BeOS
  113. {$endif}
  114. ESysEILSEQ = EILSEQ;
  115. {$elseif defined(OpenBSD)}
  116. {$define ACCEPT_646}
  117. CODESET = 51;
  118. LC_ALL = 0;
  119. {$elseif defined(NetBSD)}
  120. {$define ACCEPT_646}
  121. CODESET = 51;
  122. LC_ALL = 0;
  123. {$elseif defined(aix)}
  124. CODESET = 49;
  125. LC_ALL = -1;
  126. {$elseif defined(dragonfly)}
  127. CODESET = 0;
  128. LC_ALL = 0;
  129. __LC_CTYPE = 0;
  130. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  131. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  132. {$else not aix}
  133. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  134. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  135. {$endif}
  136. { unicode encoding name }
  137. {$ifdef FPC_LITTLE_ENDIAN}
  138. unicode_encoding2 = 'UTF-16LE';
  139. unicode_encoding4 = 'UCS-4LE';
  140. {$else FPC_LITTLE_ENDIAN}
  141. {$ifdef AIX}
  142. unicode_encoding2 = 'UTF-16';
  143. {$else AIX}
  144. unicode_encoding2 = 'UTF-16BE';
  145. unicode_encoding4 = 'UCS-4BE';
  146. {$endif AIX}
  147. {$endif FPC_LITTLE_ENDIAN}
  148. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  149. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  150. { that's a libc macro mapped to internal functions/variables }
  151. { and thus not a stable external API on systems where libc }
  152. { breaks backwards compatibility every now and then }
  153. MB_CUR_MAX = 10;
  154. { Requests for iconvctl }
  155. ICONV_TRIVIALP = 0; // int *argument
  156. ICONV_GET_TRANSLITERATE = 1; // int *argument
  157. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  158. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  159. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  160. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  161. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  162. type
  163. piconv_t = ^iconv_t;
  164. iconv_t = pointer;
  165. nl_item = cint;
  166. {$ifdef haiku}
  167. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  168. {$else}
  169. {$ifndef beos}
  170. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  171. {$endif}
  172. {$endif}
  173. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  174. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  175. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  176. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  177. const
  178. iconvctlname='iconvctl';
  179. {$else}
  180. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  181. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  182. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  183. const
  184. iconvctlname='libiconvctl';
  185. {$endif}
  186. var
  187. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  188. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  189. threadvar
  190. iconv_ansi2wide,
  191. iconv_wide2ansi : iconv_t;
  192. { since we cache the iconv_t converters, we have to do the same
  193. for the DefaultSystemCodePage variable since if it changes, we
  194. have to re-initialize the converters too. We can't do that via
  195. a callback in the widestring manager because DefaultSystemCodePage
  196. is not a threadvar and we can't automatically change this in all
  197. threads }
  198. current_DefaultSystemCodePage: TSystemCodePage;
  199. procedure InitThread;
  200. var
  201. transliterate: cint;
  202. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  203. iconvindex: longint;
  204. {$endif}
  205. iconvname, toencoding: rawbytestring;
  206. begin
  207. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  208. {$if declared(iconvindex)}
  209. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  210. if iconvindex<>-1 then
  211. iconvname:=UnixCpMap[iconvindex].name
  212. else
  213. { default to UTF-8 on Unix platforms }
  214. iconvname:='UTF-8';
  215. {$else}
  216. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  217. iconvname:='UTF-8';
  218. {$endif}
  219. toencoding:=iconvname;
  220. if not assigned(iconvctl) then
  221. toencoding:=toencoding+'//TRANSLIT';
  222. iconv_wide2ansi:=iconv_open(pchar(toencoding),unicode_encoding2);
  223. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  224. if assigned(iconvctl) and
  225. (iconv_wide2ansi<>iconv_t(-1)) then
  226. begin
  227. transliterate:=1;
  228. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  229. end;
  230. end;
  231. procedure FiniThread;
  232. begin
  233. if (iconv_wide2ansi <> iconv_t(-1)) then
  234. iconv_close(iconv_wide2ansi);
  235. if (iconv_ansi2wide <> iconv_t(-1)) then
  236. iconv_close(iconv_ansi2wide);
  237. end;
  238. {$if defined(beos) and not defined(haiku)}
  239. function nl_langinfo(__item:nl_item):pchar;
  240. begin
  241. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  242. // Now implement the minimum required to correctly initialize WideString support
  243. case __item of
  244. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  245. else
  246. begin
  247. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  248. Result := '';
  249. end;
  250. end;
  251. end;
  252. {$endif}
  253. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  254. var
  255. iconvindex: longint;
  256. toencoding: rawbytestring;
  257. transliterate: cint;
  258. begin
  259. { TODO: add caching (then we also don't need separate code for
  260. the default system page and other ones)
  261. -- typecasting an ansistring function result to pchar is
  262. unsafe normally, but these are constant strings -> no
  263. problem }
  264. open_iconv_for_cps:=iconv_t(-1);
  265. iconvindex:=GetCodepageData(cp);
  266. if iconvindex=-1 then
  267. exit;
  268. repeat
  269. if cp_is_from then
  270. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  271. else
  272. begin
  273. toencoding:=UnixCpMap[iconvindex].name;
  274. if not assigned(iconvctl) then
  275. toencoding:=toencoding+'//TRANSLIT';
  276. open_iconv_for_cps:=iconv_open(pchar(toencoding),otherencoding);
  277. end;
  278. inc(iconvindex);
  279. until (open_iconv_for_cps<>iconv_t(-1)) or
  280. (iconvindex>high(UnixCpMap)) or
  281. (UnixCpMap[iconvindex].cp<>cp);
  282. if not cp_is_from and
  283. (open_iconv_for_cps<>iconv_t(-1)) and
  284. assigned(iconvctl) then
  285. begin
  286. transliterate:=1;
  287. iconvctl(open_iconv_for_cps,ICONV_SET_TRANSLITERATE,@transliterate);
  288. end;
  289. end;
  290. {$ifdef aix}
  291. {$i cwstraix.inc}
  292. {$endif aix}
  293. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  294. var
  295. outlength,
  296. outoffset,
  297. srclen,
  298. outleft : size_t;
  299. use_iconv: iconv_t;
  300. srcpos : pwidechar;
  301. destpos: pchar;
  302. mynil : pchar;
  303. my0 : size_t;
  304. err : longint;
  305. transliterate: cint;
  306. free_iconv: boolean;
  307. {$ifdef aix}
  308. intermediate: rawbytestring;
  309. {$endif aix}
  310. begin
  311. {$ifdef aix}
  312. { AIX libiconv does not support converting cp866 to anything else except
  313. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  314. if cp=866 then
  315. begin
  316. Wide2AnsiMove(source,intermediate,28595,len);
  317. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  318. exit;
  319. end;
  320. {$endif aix}
  321. if (cp=DefaultSystemCodePage) then
  322. begin
  323. { update iconv converter in case the DefaultSystemCodePage has been
  324. changed }
  325. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  326. begin
  327. FiniThread;
  328. InitThread;
  329. end;
  330. use_iconv:=iconv_wide2ansi;
  331. free_iconv:=false;
  332. end
  333. else
  334. begin
  335. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  336. if (use_iconv<>iconv_t(-1)) and
  337. assigned(iconvctl) then
  338. begin
  339. transliterate:=1;
  340. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  341. end;
  342. free_iconv:=true;
  343. end;
  344. { unsupported encoding -> default move }
  345. if use_iconv=iconv_t(-1) then
  346. begin
  347. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  348. exit;
  349. end;
  350. mynil:=nil;
  351. my0:=0;
  352. { rought estimation }
  353. setlength(dest,len*3);
  354. outlength:=len*3;
  355. srclen:=len*2;
  356. srcpos:=source;
  357. destpos:=pchar(dest);
  358. outleft:=outlength;
  359. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  360. begin
  361. err:=fpgetCerrno;
  362. case err of
  363. { last character is incomplete sequence }
  364. ESysEINVAL,
  365. { incomplete sequence in the middle }
  366. ESysEILSEQ:
  367. begin
  368. { skip and set to '?' }
  369. inc(srcpos);
  370. dec(srclen,2);
  371. destpos^:='?';
  372. inc(destpos);
  373. dec(outleft);
  374. { reset }
  375. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  376. if err=ESysEINVAL then
  377. break;
  378. end;
  379. ESysE2BIG:
  380. begin
  381. outoffset:=destpos-pchar(dest);
  382. { extend }
  383. setlength(dest,outlength+len*3);
  384. inc(outleft,len*3);
  385. inc(outlength,len*3);
  386. { string could have been moved }
  387. destpos:=pchar(dest)+outoffset;
  388. end;
  389. else
  390. runerror(231);
  391. end;
  392. end;
  393. // truncate string
  394. setlength(dest,length(dest)-outleft);
  395. SetCodePage(dest,cp,false);
  396. if free_iconv then
  397. iconv_close(use_iconv);
  398. end;
  399. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  400. var
  401. outlength,
  402. outoffset,
  403. outleft : size_t;
  404. use_iconv: iconv_t;
  405. srcpos,
  406. destpos: pchar;
  407. mynil : pchar;
  408. my0 : size_t;
  409. err: cint;
  410. iconvindex: longint;
  411. free_iconv: boolean;
  412. {$ifdef aix}
  413. intermediate: rawbytestring;
  414. {$endif aix}
  415. begin
  416. {$ifdef aix}
  417. { AIX libiconv does not support converting cp866 to anything else except
  418. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  419. if cp=866 then
  420. begin
  421. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  422. source:=pchar(intermediate);
  423. end;
  424. {$endif aix}
  425. if (cp=DefaultSystemCodePage) then
  426. begin
  427. { update iconv converter in case the DefaultSystemCodePage has been
  428. changed }
  429. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  430. begin
  431. FiniThread;
  432. InitThread;
  433. end;
  434. use_iconv:=iconv_ansi2wide;
  435. free_iconv:=false;
  436. end
  437. else
  438. begin
  439. { TODO: add caching (then we also don't need separate code for
  440. the default system page and other ones)
  441. -- typecasting an ansistring function result to pchar is
  442. unsafe normally, but these are constant strings -> no
  443. problem }
  444. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  445. free_iconv:=true;
  446. end;
  447. { unsupported encoding -> default move }
  448. if use_iconv=iconv_t(-1) then
  449. begin
  450. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  451. exit;
  452. end;
  453. mynil:=nil;
  454. my0:=0;
  455. // extra space
  456. outlength:=len+1;
  457. setlength(dest,outlength);
  458. srcpos:=source;
  459. destpos:=pchar(dest);
  460. outleft:=outlength*2;
  461. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  462. begin
  463. err:=fpgetCerrno;
  464. case err of
  465. ESysEINVAL,
  466. ESysEILSEQ:
  467. begin
  468. { skip and set to '?' }
  469. inc(srcpos);
  470. dec(len);
  471. pwidechar(destpos)^:='?';
  472. inc(destpos,2);
  473. dec(outleft,2);
  474. { reset }
  475. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  476. if err=ESysEINVAL then
  477. break;
  478. end;
  479. ESysE2BIG:
  480. begin
  481. outoffset:=destpos-pchar(dest);
  482. { extend }
  483. setlength(dest,outlength+len);
  484. inc(outleft,len*2);
  485. inc(outlength,len);
  486. { string could have been moved }
  487. destpos:=pchar(dest)+outoffset;
  488. end;
  489. else
  490. runerror(231);
  491. end;
  492. end;
  493. // truncate string
  494. setlength(dest,length(dest)-outleft div 2);
  495. if free_iconv then
  496. iconv_close(use_iconv);
  497. end;
  498. function LowerWideString(const s : WideString) : WideString;
  499. var
  500. i : SizeInt;
  501. begin
  502. SetLength(result,length(s));
  503. for i:=0 to length(s)-1 do
  504. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  505. end;
  506. function UpperWideString(const s : WideString) : WideString;
  507. var
  508. i : SizeInt;
  509. begin
  510. SetLength(result,length(s));
  511. for i:=0 to length(s)-1 do
  512. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  513. end;
  514. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  515. begin
  516. if (len>length(s)) then
  517. if (length(s) < 10*256) then
  518. setlength(s,length(s)+10)
  519. else
  520. setlength(s,length(s)+length(s) shr 8);
  521. end;
  522. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  523. begin
  524. EnsureAnsiLen(s,index);
  525. pchar(@s[index])^:=c;
  526. inc(index);
  527. end;
  528. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  529. {$if not(defined(beos) and not defined(haiku))}
  530. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  531. {$else not beos}
  532. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  533. {$endif beos}
  534. var
  535. p : pchar;
  536. mblen : size_t;
  537. begin
  538. { we know that s is unique -> avoid uniquestring calls}
  539. p:=@s[index];
  540. if (nc<=127) then
  541. ConcatCharToAnsiStr(char(nc),s,index)
  542. else
  543. begin
  544. EnsureAnsiLen(s,index+MB_CUR_MAX);
  545. {$if not(defined(beos) and not defined(haiku))}
  546. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  547. {$else not beos}
  548. mblen:=wctomb(p,wchar_t(nc));
  549. {$endif not beos}
  550. if (mblen<>size_t(-1)) then
  551. inc(index,mblen)
  552. else
  553. begin
  554. { invalid wide char }
  555. p^:='?';
  556. inc(index);
  557. end;
  558. end;
  559. end;
  560. function LowerAnsiString(const s : AnsiString) : AnsiString;
  561. var
  562. i, slen,
  563. resindex : SizeInt;
  564. mblen : size_t;
  565. {$if not(defined(beos) and not defined(haiku))}
  566. ombstate,
  567. nmbstate : mbstate_t;
  568. {$endif beos}
  569. wc : wchar_t;
  570. begin
  571. {$if not(defined(beos) and not defined(haiku))}
  572. fillchar(ombstate,sizeof(ombstate),0);
  573. fillchar(nmbstate,sizeof(nmbstate),0);
  574. {$endif beos}
  575. slen:=length(s);
  576. SetLength(result,slen+10);
  577. i:=1;
  578. resindex:=1;
  579. while (i<=slen) do
  580. begin
  581. if (s[i]<=#127) then
  582. begin
  583. wc:=wchar_t(s[i]);
  584. mblen:= 1;
  585. end
  586. else
  587. {$if not(defined(beos) and not defined(haiku))}
  588. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  589. {$else not beos}
  590. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  591. {$endif not beos}
  592. case mblen of
  593. size_t(-2):
  594. begin
  595. { partial invalid character, copy literally }
  596. while (i<=slen) do
  597. begin
  598. ConcatCharToAnsiStr(s[i],result,resindex);
  599. inc(i);
  600. end;
  601. end;
  602. size_t(-1), 0:
  603. begin
  604. { invalid or null character }
  605. ConcatCharToAnsiStr(s[i],result,resindex);
  606. inc(i);
  607. end;
  608. else
  609. begin
  610. { a valid sequence }
  611. { even if mblen = 1, the lowercase version may have a }
  612. { different length }
  613. { We can't do anything special if wchar_t is 16 bit... }
  614. {$if not(defined(beos) and not defined(haiku))}
  615. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  616. {$else not beos}
  617. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  618. {$endif not beos}
  619. inc(i,mblen);
  620. end;
  621. end;
  622. end;
  623. SetLength(result,resindex-1);
  624. end;
  625. function UpperAnsiString(const s : AnsiString) : AnsiString;
  626. var
  627. i, slen,
  628. resindex : SizeInt;
  629. mblen : size_t;
  630. {$if not(defined(beos) and not defined(haiku))}
  631. ombstate,
  632. nmbstate : mbstate_t;
  633. {$endif beos}
  634. wc : wchar_t;
  635. begin
  636. {$if not(defined(beos) and not defined(haiku))}
  637. fillchar(ombstate,sizeof(ombstate),0);
  638. fillchar(nmbstate,sizeof(nmbstate),0);
  639. {$endif beos}
  640. slen:=length(s);
  641. SetLength(result,slen+10);
  642. i:=1;
  643. resindex:=1;
  644. while (i<=slen) do
  645. begin
  646. if (s[i]<=#127) then
  647. begin
  648. wc:=wchar_t(s[i]);
  649. mblen:= 1;
  650. end
  651. else
  652. {$if not(defined(beos) and not defined(haiku))}
  653. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  654. {$else not beos}
  655. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  656. {$endif beos}
  657. case mblen of
  658. size_t(-2):
  659. begin
  660. { partial invalid character, copy literally }
  661. while (i<=slen) do
  662. begin
  663. ConcatCharToAnsiStr(s[i],result,resindex);
  664. inc(i);
  665. end;
  666. end;
  667. size_t(-1), 0:
  668. begin
  669. { invalid or null character }
  670. ConcatCharToAnsiStr(s[i],result,resindex);
  671. inc(i);
  672. end;
  673. else
  674. begin
  675. { a valid sequence }
  676. { even if mblen = 1, the uppercase version may have a }
  677. { different length }
  678. { We can't do anything special if wchar_t is 16 bit... }
  679. {$if not(defined(beos) and not defined(haiku))}
  680. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  681. {$else not beos}
  682. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  683. {$endif not beos}
  684. inc(i,mblen);
  685. end;
  686. end;
  687. end;
  688. SetLength(result,resindex-1);
  689. end;
  690. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  691. var
  692. i, slen,
  693. destindex : SizeInt;
  694. uch : UCS4Char;
  695. begin
  696. slen:=length(s);
  697. setlength(result,slen+1);
  698. i:=1;
  699. destindex:=0;
  700. while (i<=slen) do
  701. begin
  702. uch:=UCS4Char(s[i]);
  703. if (uch=0) then
  704. result[destindex]:=32
  705. else if (uch<=$d7ff) or (uch>=$e000) then
  706. result[destindex]:=uch
  707. else if (uch<=$dbff) and
  708. (i<slen) and
  709. (s[i+1]>=#$dc00) and
  710. (s[i+1]<=#$dfff) then
  711. begin
  712. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  713. inc(i);
  714. end
  715. else { invalid surrogate pair }
  716. result[destindex]:=uch;
  717. inc(i);
  718. inc(destindex);
  719. end;
  720. result[destindex]:=UCS4Char(0);
  721. { Trimming length in this particular case is just a waste of time,
  722. because result will be interpreted as null-terminated and discarded
  723. almost immediately }
  724. end;
  725. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  726. {$if not(defined (aix) and defined(cpupowerpc32))}
  727. var
  728. hs1,hs2 : UCS4String;
  729. us1,us2 : WideString;
  730. begin
  731. { wcscoll interprets null chars as end-of-string -> filter out }
  732. if coIgnoreCase in Options then
  733. begin
  734. us1:=UpperWideString(s1);
  735. us2:=UpperWideString(s2);
  736. end
  737. else
  738. begin
  739. us1:=s1;
  740. us2:=s2;
  741. end;
  742. hs1:=WideStringToUCS4StringNoNulls(us1);
  743. hs2:=WideStringToUCS4StringNoNulls(us2);
  744. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  745. end;
  746. {$else}
  747. { AIX/PPC32 has a 16 bit wchar_t }
  748. var
  749. i, len: longint;
  750. us1,us2 : WideString;
  751. hs1, hs2: array of widechar;
  752. begin
  753. if coIgnoreCase in Options then
  754. begin
  755. us1:=UpperWideString(s1);
  756. us2:=UpperWideString(s2);
  757. end
  758. else
  759. begin
  760. us1:=s1;
  761. us2:=s2;
  762. end;
  763. len:=length(us1);
  764. setlength(hs1,len+1);
  765. for i:=1 to len do
  766. if us1[i]<>#0 then
  767. hs1[i-1]:=us1[i]
  768. else
  769. hs1[i-1]:=#32;
  770. hs1[len]:=#0;
  771. len:=length(us2);
  772. setlength(hs2,len+1);
  773. for i:=1 to len do
  774. if us2[i]<>#0 then
  775. hs2[i-1]:=us2[i]
  776. else
  777. hs2[i-1]:=#32;
  778. hs2[len]:=#0;
  779. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  780. end;
  781. {$endif}
  782. { return value: number of code points in the string. Whenever an invalid
  783. code point is encountered, all characters part of this invalid code point
  784. are considered to form one "character" and the next character is
  785. considered to be the start of a new (possibly also invalid) code point }
  786. function CharLengthPChar(const Str: PChar): PtrInt;
  787. var
  788. nextlen: ptrint;
  789. s: pchar;
  790. {$if not(defined(beos) and not defined(haiku))}
  791. mbstate: mbstate_t;
  792. {$endif not beos}
  793. begin
  794. result:=0;
  795. s:=str;
  796. {$if not(defined(beos) and not defined(haiku))}
  797. fillchar(mbstate,sizeof(mbstate),0);
  798. {$endif not beos}
  799. repeat
  800. {$if defined(beos) and not defined(haiku)}
  801. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  802. {$else beos}
  803. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  804. {$endif beos}
  805. { skip invalid/incomplete sequences }
  806. if (nextlen<0) then
  807. nextlen:=1;
  808. inc(result,1);
  809. inc(s,nextlen);
  810. until (nextlen=0);
  811. end;
  812. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  813. {$if not(defined(beos) and not defined(haiku))}
  814. var
  815. mbstate: mbstate_t;
  816. {$endif not beos}
  817. begin
  818. {$if defined(beos) and not defined(haiku)}
  819. result:=ptrint(mblen(str,maxlookahead));
  820. {$else beos}
  821. fillchar(mbstate,sizeof(mbstate),0);
  822. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  823. { mbrlen can also return -2 for "incomplete but potially valid character
  824. and data has been processed" }
  825. if result<0 then
  826. result:=-1;
  827. {$endif beos}
  828. end;
  829. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  830. var
  831. a,b: pchar;
  832. i: PtrInt;
  833. begin
  834. if not(canmodifys1) then
  835. getmem(a,len1+1)
  836. else
  837. a:=s1;
  838. for i:=0 to len1-1 do
  839. if s1[i]<>#0 then
  840. a[i]:=s1[i]
  841. else
  842. a[i]:=#32;
  843. a[len1]:=#0;
  844. if not(canmodifys2) then
  845. getmem(b,len2+1)
  846. else
  847. b:=s2;
  848. for i:=0 to len2-1 do
  849. if s2[i]<>#0 then
  850. b[i]:=s2[i]
  851. else
  852. b[i]:=#32;
  853. b[len2]:=#0;
  854. result:=strcoll(a,b);
  855. if not(canmodifys1) then
  856. freemem(a);
  857. if not(canmodifys2) then
  858. freemem(b);
  859. end;
  860. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  861. begin
  862. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  863. end;
  864. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  865. begin
  866. result:=strcoll(s1,s2);
  867. end;
  868. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  869. var
  870. a, b: AnsiString;
  871. begin
  872. a:=UpperAnsistring(s1);
  873. b:=UpperAnsistring(s2);
  874. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  875. end;
  876. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  877. begin
  878. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  879. end;
  880. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  881. var
  882. a, b: pchar;
  883. begin
  884. if (maxlen=0) then
  885. exit(0);
  886. if (s1[maxlen]<>#0) then
  887. begin
  888. getmem(a,maxlen+1);
  889. move(s1^,a^,maxlen);
  890. a[maxlen]:=#0;
  891. end
  892. else
  893. a:=s1;
  894. if (s2[maxlen]<>#0) then
  895. begin
  896. getmem(b,maxlen+1);
  897. move(s2^,b^,maxlen);
  898. b[maxlen]:=#0;
  899. end
  900. else
  901. b:=s2;
  902. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  903. if (a<>s1) then
  904. freemem(a);
  905. if (b<>s2) then
  906. freemem(b);
  907. end;
  908. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  909. var
  910. a, b: ansistring;
  911. begin
  912. if (maxlen=0) then
  913. exit(0);
  914. setlength(a,maxlen);
  915. move(s1^,a[1],maxlen);
  916. setlength(b,maxlen);
  917. move(s2^,b[1],maxlen);
  918. result:=AnsiCompareText(a,b);
  919. end;
  920. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  921. var
  922. newlen: sizeint;
  923. begin
  924. newlen:=length(s);
  925. if newlen>strlen(orgp) then
  926. fpc_rangeerror;
  927. p:=orgp;
  928. if (newlen>0) then
  929. move(s[1],p[0],newlen);
  930. p[newlen]:=#0;
  931. end;
  932. function AnsiStrLower(Str: PChar): PChar;
  933. var
  934. temp: ansistring;
  935. begin
  936. temp:=loweransistring(str);
  937. ansi2pchar(temp,str,result);
  938. end;
  939. function AnsiStrUpper(Str: PChar): PChar;
  940. var
  941. temp: ansistring;
  942. begin
  943. temp:=upperansistring(str);
  944. ansi2pchar(temp,str,result);
  945. end;
  946. function envvarset(const varname: pchar): boolean;
  947. var
  948. varval: pchar;
  949. begin
  950. varval:=fpgetenv(varname);
  951. result:=
  952. assigned(varval) and
  953. (varval[0]<>#0);
  954. end;
  955. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  956. var
  957. langinfo: pchar;
  958. begin
  959. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  960. if stdcp=scpFileSystemSingleByte then
  961. begin
  962. result:=CP_UTF8;
  963. exit;
  964. end;
  965. {$endif}
  966. { if none of the relevant LC_* environment variables are set, fall back to
  967. UTF-8 (this happens under some versions of OS X for GUI applications, which
  968. otherwise get CP_ASCII) }
  969. if envvarset('LC_ALL') or
  970. envvarset('LC_CTYPE') or
  971. envvarset('LANG') then
  972. begin
  973. langinfo:=nl_langinfo(CODESET);
  974. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  975. that causes it to return an empty string of UTF-8 locales
  976. -> patch up (and in general, UTF-8 is a good default on
  977. Unix platforms) }
  978. if not assigned(langinfo) or
  979. (langinfo^=#0) then
  980. langinfo:='UTF-8';
  981. Result:=GetCodepageByName(ansistring(langinfo));
  982. end
  983. else
  984. Result:=unixcp.GetSystemCodepage;
  985. end;
  986. {$ifdef FPC_HAS_CPSTRING}
  987. procedure SetStdIOCodePage(var T: Text); inline;
  988. begin
  989. case TextRec(T).Mode of
  990. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  991. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  992. end;
  993. end;
  994. procedure SetStdIOCodePages; inline;
  995. begin
  996. SetStdIOCodePage(Input);
  997. SetStdIOCodePage(Output);
  998. SetStdIOCodePage(ErrOutput);
  999. SetStdIOCodePage(StdOut);
  1000. SetStdIOCodePage(StdErr);
  1001. end;
  1002. {$endif FPC_HAS_CPSTRING}
  1003. var
  1004. OrgWideStringManager: TUnicodeStringManager;
  1005. Procedure SetCWideStringManager;
  1006. Var
  1007. CWideStringManager : TUnicodeStringManager;
  1008. begin
  1009. GetUnicodeStringManager(OrgWideStringManager);
  1010. CWideStringManager:=OrgWideStringManager;
  1011. With CWideStringManager do
  1012. begin
  1013. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  1014. Ansi2WideMoveProc:=@Ansi2WideMove;
  1015. UpperWideStringProc:=@UpperWideString;
  1016. LowerWideStringProc:=@LowerWideString;
  1017. CompareWideStringProc:=@CompareWideString;
  1018. // CompareTextWideStringProc:=@CompareTextWideString;
  1019. CharLengthPCharProc:=@CharLengthPChar;
  1020. CodePointLengthProc:=@CodePointLength;
  1021. UpperAnsiStringProc:=@UpperAnsiString;
  1022. LowerAnsiStringProc:=@LowerAnsiString;
  1023. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1024. CompareTextAnsiStringProc:=@AnsiCompareText;
  1025. StrCompAnsiStringProc:=@StrCompAnsi;
  1026. StrICompAnsiStringProc:=@AnsiStrIComp;
  1027. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1028. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1029. StrLowerAnsiStringProc:=@AnsiStrLower;
  1030. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1031. ThreadInitProc:=@InitThread;
  1032. ThreadFiniProc:=@FiniThread;
  1033. { Unicode }
  1034. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1035. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1036. UpperUnicodeStringProc:=@UpperWideString;
  1037. LowerUnicodeStringProc:=@LowerWideString;
  1038. CompareUnicodeStringProc:=@CompareWideString;
  1039. { CodePage }
  1040. GetStandardCodePageProc:=@GetStandardCodePage;
  1041. end;
  1042. SetUnicodeStringManager(CWideStringManager);
  1043. end;
  1044. var
  1045. iconvlib:TLibHandle;
  1046. initialization
  1047. SetCWideStringManager;
  1048. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1049. { with the information from the environment variables according to POSIX }
  1050. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1051. setlocale(LC_ALL,'');
  1052. { load iconv library and iconvctl function }
  1053. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1054. if iconvlib=0 then
  1055. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
  1056. if iconvlib<>0 then
  1057. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1058. { set the DefaultSystemCodePage }
  1059. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1060. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1061. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1062. {$ifdef FPC_HAS_CPSTRING}
  1063. SetStdIOCodePages;
  1064. {$endif FPC_HAS_CPSTRING}
  1065. { init conversion tables for main program }
  1066. InitThread;
  1067. finalization
  1068. { fini conversion tables for main program }
  1069. FiniThread;
  1070. { unload iconv library }
  1071. if iconvlib<>0 then
  1072. FreeLibrary(iconvlib);
  1073. { restore previous (probably default) widestring manager so that subsequent calls
  1074. into the widestring manager won't trigger the finalized functionality }
  1075. SetWideStringManager(OrgWideStringManager);
  1076. end.