cwstring.pp 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifndef iconv_is_in_libc}
  30. {$if defined(haiku)}
  31. {$linklib textencoding}
  32. {$else}
  33. {$linklib iconv}
  34. {$endif}
  35. {$define useiconv}
  36. {$endif not iconv_is_in_libc}
  37. {$i rtldefs.inc}
  38. Uses
  39. BaseUnix,
  40. ctypes,
  41. unix,
  42. unixtype,
  43. initc,
  44. dynlibs,
  45. unixcp;
  46. Const
  47. {$ifndef useiconv}
  48. libiconvname='c'; // is in libc under Linux.
  49. libprefix='lib';
  50. {$else}
  51. {$ifdef haiku}
  52. libiconvname='textencoding'; // is in libtextencoding under Haiku
  53. libprefix='lib';
  54. {$else}
  55. {$if defined(darwin) or defined(aix)}
  56. libiconvname='libiconv';
  57. libprefix='';
  58. {$else}
  59. libiconvname='iconv';
  60. libprefix='lib';
  61. {$endif}
  62. {$endif}
  63. {$endif}
  64. { helper functions from libc }
  65. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  66. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  67. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  68. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  69. {$ifdef netbsd}
  70. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  71. that should be used }
  72. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale_mb_len_max_32';
  73. {$else}
  74. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  75. {$endif}
  76. {$if not(defined(beos) and not defined(haiku))}
  77. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  78. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  79. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  80. {$else beos}
  81. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  82. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  83. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  84. {$endif beos}
  85. const
  86. {$if defined(linux)}
  87. __LC_CTYPE = 0;
  88. LC_ALL = 6;
  89. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  90. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  91. CODESET = _NL_CTYPE_CODESET_NAME;
  92. {$elseif defined(darwin)}
  93. CODESET = 0;
  94. LC_ALL = 0;
  95. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  96. __LC_CTYPE = 0;
  97. LC_ALL = 0;
  98. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  99. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  100. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  101. {$elseif defined(solaris)}
  102. {$define ACCEPT_646}
  103. CODESET=49;
  104. LC_ALL = 6;
  105. {$elseif defined(beos)}
  106. {$ifdef haiku}
  107. CODESET= 0; // Checked for Haiku
  108. LC_ALL = 0; // Checked for Haiku
  109. {$else}
  110. {$warning check correct value for BeOS}
  111. CODESET=49;
  112. LC_ALL = 6; // Checked for BeOS
  113. {$endif}
  114. ESysEILSEQ = EILSEQ;
  115. {$elseif defined(OpenBSD)}
  116. {$define ACCEPT_646}
  117. CODESET = 51;
  118. LC_ALL = 0;
  119. {$elseif defined(NetBSD)}
  120. {$define ACCEPT_646}
  121. CODESET = 51;
  122. LC_ALL = 0;
  123. {$elseif defined(aix)}
  124. CODESET = 49;
  125. LC_ALL = -1;
  126. {$elseif defined(dragonfly)}
  127. CODESET = 0;
  128. LC_ALL = 0;
  129. __LC_CTYPE = 0;
  130. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  131. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  132. {$else not aix}
  133. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  134. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  135. {$endif}
  136. { unicode encoding name }
  137. {$ifdef FPC_LITTLE_ENDIAN}
  138. unicode_encoding2 = 'UTF-16LE';
  139. unicode_encoding4 = 'UCS-4LE';
  140. {$else FPC_LITTLE_ENDIAN}
  141. {$ifdef AIX}
  142. unicode_encoding2 = 'UTF-16';
  143. {$else AIX}
  144. unicode_encoding2 = 'UTF-16BE';
  145. unicode_encoding4 = 'UCS-4BE';
  146. {$endif AIX}
  147. {$endif FPC_LITTLE_ENDIAN}
  148. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  149. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  150. { that's a libc macro mapped to internal functions/variables }
  151. { and thus not a stable external API on systems where libc }
  152. { breaks backwards compatibility every now and then }
  153. MB_CUR_MAX = 10;
  154. { Requests for iconvctl }
  155. ICONV_TRIVIALP = 0; // int *argument
  156. ICONV_GET_TRANSLITERATE = 1; // int *argument
  157. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  158. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  159. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  160. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  161. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  162. type
  163. piconv_t = ^iconv_t;
  164. iconv_t = pointer;
  165. nl_item = cint;
  166. {$ifdef haiku}
  167. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  168. {$else}
  169. {$ifndef beos}
  170. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  171. {$endif}
  172. {$endif}
  173. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  174. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  175. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  176. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  177. const
  178. iconvctlname='iconvctl';
  179. {$else}
  180. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  181. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  182. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  183. const
  184. iconvctlname='libiconvctl';
  185. {$endif}
  186. var
  187. iconvctl:function(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;
  188. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  189. threadvar
  190. iconv_ansi2wide,
  191. iconv_wide2ansi : iconv_t;
  192. { since we cache the iconv_t converters, we have to do the same
  193. for the DefaultSystemCodePage variable since if it changes, we
  194. have to re-initialize the converters too. We can't do that via
  195. a callback in the widestring manager because DefaultSystemCodePage
  196. is not a threadvar and we can't automatically change this in all
  197. threads }
  198. current_DefaultSystemCodePage: TSystemCodePage;
  199. procedure InitThread;
  200. var
  201. transliterate: cint;
  202. iconvindex: longint;
  203. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  204. iconvname: rawbytestring;
  205. {$endif}
  206. begin
  207. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  208. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  209. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  210. if iconvindex<>-1 then
  211. iconvname:=UnixCpMap[iconvindex].name
  212. else
  213. { default to UTF-8 on Unix platforms }
  214. iconvname:='UTF-8';
  215. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  216. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  217. {$else}
  218. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  219. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  220. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  221. {$endif}
  222. if assigned(iconvctl) and
  223. (iconv_wide2ansi<>iconv_t(-1)) then
  224. begin
  225. transliterate:=1;
  226. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  227. end;
  228. end;
  229. procedure FiniThread;
  230. begin
  231. if (iconv_wide2ansi <> iconv_t(-1)) then
  232. iconv_close(iconv_wide2ansi);
  233. if (iconv_ansi2wide <> iconv_t(-1)) then
  234. iconv_close(iconv_ansi2wide);
  235. end;
  236. {$if defined(beos) and not defined(haiku)}
  237. function nl_langinfo(__item:nl_item):pchar;
  238. begin
  239. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  240. // Now implement the minimum required to correctly initialize WideString support
  241. case __item of
  242. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  243. else
  244. begin
  245. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  246. Result := '';
  247. end;
  248. end;
  249. end;
  250. {$endif}
  251. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  252. var
  253. iconvindex: longint;
  254. begin
  255. { TODO: add caching (then we also don't need separate code for
  256. the default system page and other ones)
  257. -- typecasting an ansistring function result to pchar is
  258. unsafe normally, but these are constant strings -> no
  259. problem }
  260. open_iconv_for_cps:=iconv_t(-1);
  261. iconvindex:=GetCodepageData(cp);
  262. if iconvindex=-1 then
  263. exit;
  264. repeat
  265. if cp_is_from then
  266. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  267. else
  268. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name),otherencoding);
  269. inc(iconvindex);
  270. until (open_iconv_for_cps<>iconv_t(-1)) or
  271. (iconvindex>high(UnixCpMap)) or
  272. (UnixCpMap[iconvindex].cp<>cp);
  273. end;
  274. {$ifdef aix}
  275. {$i cwstraix.inc}
  276. {$endif aix}
  277. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  278. var
  279. outlength,
  280. outoffset,
  281. srclen,
  282. outleft : size_t;
  283. use_iconv: iconv_t;
  284. srcpos : pwidechar;
  285. destpos: pchar;
  286. mynil : pchar;
  287. my0 : size_t;
  288. err : longint;
  289. transliterate: cint;
  290. free_iconv: boolean;
  291. {$ifdef aix}
  292. intermediate: rawbytestring;
  293. {$endif aix}
  294. begin
  295. {$ifdef aix}
  296. { AIX libiconv does not support converting cp866 to anything else except
  297. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  298. if cp=866 then
  299. begin
  300. Wide2AnsiMove(source,intermediate,28595,len);
  301. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  302. exit;
  303. end;
  304. {$endif aix}
  305. if (cp=DefaultSystemCodePage) then
  306. begin
  307. { update iconv converter in case the DefaultSystemCodePage has been
  308. changed }
  309. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  310. begin
  311. FiniThread;
  312. InitThread;
  313. end;
  314. use_iconv:=iconv_wide2ansi;
  315. free_iconv:=false;
  316. end
  317. else
  318. begin
  319. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  320. if (use_iconv<>iconv_t(-1)) and
  321. assigned(iconvctl) then
  322. begin
  323. transliterate:=1;
  324. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  325. end;
  326. free_iconv:=true;
  327. end;
  328. { unsupported encoding -> default move }
  329. if use_iconv=iconv_t(-1) then
  330. begin
  331. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  332. exit;
  333. end;
  334. mynil:=nil;
  335. my0:=0;
  336. { rought estimation }
  337. setlength(dest,len*3);
  338. outlength:=len*3;
  339. srclen:=len*2;
  340. srcpos:=source;
  341. destpos:=pchar(dest);
  342. outleft:=outlength;
  343. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  344. begin
  345. err:=fpgetCerrno;
  346. case err of
  347. { last character is incomplete sequence }
  348. ESysEINVAL,
  349. { incomplete sequence in the middle }
  350. ESysEILSEQ:
  351. begin
  352. { skip and set to '?' }
  353. inc(srcpos);
  354. dec(srclen,2);
  355. destpos^:='?';
  356. inc(destpos);
  357. dec(outleft);
  358. { reset }
  359. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  360. if err=ESysEINVAL then
  361. break;
  362. end;
  363. ESysE2BIG:
  364. begin
  365. outoffset:=destpos-pchar(dest);
  366. { extend }
  367. setlength(dest,outlength+len*3);
  368. inc(outleft,len*3);
  369. inc(outlength,len*3);
  370. { string could have been moved }
  371. destpos:=pchar(dest)+outoffset;
  372. end;
  373. else
  374. runerror(231);
  375. end;
  376. end;
  377. // truncate string
  378. setlength(dest,length(dest)-outleft);
  379. SetCodePage(dest,cp,false);
  380. if free_iconv then
  381. iconv_close(use_iconv);
  382. end;
  383. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  384. var
  385. outlength,
  386. outoffset,
  387. outleft : size_t;
  388. use_iconv: iconv_t;
  389. srcpos,
  390. destpos: pchar;
  391. mynil : pchar;
  392. my0 : size_t;
  393. err: cint;
  394. iconvindex: longint;
  395. free_iconv: boolean;
  396. {$ifdef aix}
  397. intermediate: rawbytestring;
  398. {$endif aix}
  399. begin
  400. {$ifdef aix}
  401. { AIX libiconv does not support converting cp866 to anything else except
  402. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  403. if cp=866 then
  404. begin
  405. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  406. source:=pchar(intermediate);
  407. end;
  408. {$endif aix}
  409. if (cp=DefaultSystemCodePage) then
  410. begin
  411. { update iconv converter in case the DefaultSystemCodePage has been
  412. changed }
  413. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  414. begin
  415. FiniThread;
  416. InitThread;
  417. end;
  418. use_iconv:=iconv_ansi2wide;
  419. free_iconv:=false;
  420. end
  421. else
  422. begin
  423. { TODO: add caching (then we also don't need separate code for
  424. the default system page and other ones)
  425. -- typecasting an ansistring function result to pchar is
  426. unsafe normally, but these are constant strings -> no
  427. problem }
  428. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  429. free_iconv:=true;
  430. end;
  431. { unsupported encoding -> default move }
  432. if use_iconv=iconv_t(-1) then
  433. begin
  434. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  435. exit;
  436. end;
  437. mynil:=nil;
  438. my0:=0;
  439. // extra space
  440. outlength:=len+1;
  441. setlength(dest,outlength);
  442. srcpos:=source;
  443. destpos:=pchar(dest);
  444. outleft:=outlength*2;
  445. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  446. begin
  447. err:=fpgetCerrno;
  448. case err of
  449. ESysEINVAL,
  450. ESysEILSEQ:
  451. begin
  452. { skip and set to '?' }
  453. inc(srcpos);
  454. dec(len);
  455. pwidechar(destpos)^:='?';
  456. inc(destpos,2);
  457. dec(outleft,2);
  458. { reset }
  459. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  460. if err=ESysEINVAL then
  461. break;
  462. end;
  463. ESysE2BIG:
  464. begin
  465. outoffset:=destpos-pchar(dest);
  466. { extend }
  467. setlength(dest,outlength+len);
  468. inc(outleft,len*2);
  469. inc(outlength,len);
  470. { string could have been moved }
  471. destpos:=pchar(dest)+outoffset;
  472. end;
  473. else
  474. runerror(231);
  475. end;
  476. end;
  477. // truncate string
  478. setlength(dest,length(dest)-outleft div 2);
  479. if free_iconv then
  480. iconv_close(use_iconv);
  481. end;
  482. function LowerWideString(const s : WideString) : WideString;
  483. var
  484. i : SizeInt;
  485. begin
  486. SetLength(result,length(s));
  487. for i:=0 to length(s)-1 do
  488. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  489. end;
  490. function UpperWideString(const s : WideString) : WideString;
  491. var
  492. i : SizeInt;
  493. begin
  494. SetLength(result,length(s));
  495. for i:=0 to length(s)-1 do
  496. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  497. end;
  498. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  499. begin
  500. if (len>length(s)) then
  501. if (length(s) < 10*256) then
  502. setlength(s,length(s)+10)
  503. else
  504. setlength(s,length(s)+length(s) shr 8);
  505. end;
  506. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  507. begin
  508. EnsureAnsiLen(s,index);
  509. pchar(@s[index])^:=c;
  510. inc(index);
  511. end;
  512. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  513. {$if not(defined(beos) and not defined(haiku))}
  514. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  515. {$else not beos}
  516. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  517. {$endif beos}
  518. var
  519. p : pchar;
  520. mblen : size_t;
  521. begin
  522. { we know that s is unique -> avoid uniquestring calls}
  523. p:=@s[index];
  524. if (nc<=127) then
  525. ConcatCharToAnsiStr(char(nc),s,index)
  526. else
  527. begin
  528. EnsureAnsiLen(s,index+MB_CUR_MAX);
  529. {$if not(defined(beos) and not defined(haiku))}
  530. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  531. {$else not beos}
  532. mblen:=wctomb(p,wchar_t(nc));
  533. {$endif not beos}
  534. if (mblen<>size_t(-1)) then
  535. inc(index,mblen)
  536. else
  537. begin
  538. { invalid wide char }
  539. p^:='?';
  540. inc(index);
  541. end;
  542. end;
  543. end;
  544. function LowerAnsiString(const s : AnsiString) : AnsiString;
  545. var
  546. i, slen,
  547. resindex : SizeInt;
  548. mblen : size_t;
  549. {$if not(defined(beos) and not defined(haiku))}
  550. ombstate,
  551. nmbstate : mbstate_t;
  552. {$endif beos}
  553. wc : wchar_t;
  554. begin
  555. {$if not(defined(beos) and not defined(haiku))}
  556. fillchar(ombstate,sizeof(ombstate),0);
  557. fillchar(nmbstate,sizeof(nmbstate),0);
  558. {$endif beos}
  559. slen:=length(s);
  560. SetLength(result,slen+10);
  561. i:=1;
  562. resindex:=1;
  563. while (i<=slen) do
  564. begin
  565. if (s[i]<=#127) then
  566. begin
  567. wc:=wchar_t(s[i]);
  568. mblen:= 1;
  569. end
  570. else
  571. {$if not(defined(beos) and not defined(haiku))}
  572. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  573. {$else not beos}
  574. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  575. {$endif not beos}
  576. case mblen of
  577. size_t(-2):
  578. begin
  579. { partial invalid character, copy literally }
  580. while (i<=slen) do
  581. begin
  582. ConcatCharToAnsiStr(s[i],result,resindex);
  583. inc(i);
  584. end;
  585. end;
  586. size_t(-1), 0:
  587. begin
  588. { invalid or null character }
  589. ConcatCharToAnsiStr(s[i],result,resindex);
  590. inc(i);
  591. end;
  592. else
  593. begin
  594. { a valid sequence }
  595. { even if mblen = 1, the lowercase version may have a }
  596. { different length }
  597. { We can't do anything special if wchar_t is 16 bit... }
  598. {$if not(defined(beos) and not defined(haiku))}
  599. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  600. {$else not beos}
  601. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  602. {$endif not beos}
  603. inc(i,mblen);
  604. end;
  605. end;
  606. end;
  607. SetLength(result,resindex-1);
  608. end;
  609. function UpperAnsiString(const s : AnsiString) : AnsiString;
  610. var
  611. i, slen,
  612. resindex : SizeInt;
  613. mblen : size_t;
  614. {$if not(defined(beos) and not defined(haiku))}
  615. ombstate,
  616. nmbstate : mbstate_t;
  617. {$endif beos}
  618. wc : wchar_t;
  619. begin
  620. {$if not(defined(beos) and not defined(haiku))}
  621. fillchar(ombstate,sizeof(ombstate),0);
  622. fillchar(nmbstate,sizeof(nmbstate),0);
  623. {$endif beos}
  624. slen:=length(s);
  625. SetLength(result,slen+10);
  626. i:=1;
  627. resindex:=1;
  628. while (i<=slen) do
  629. begin
  630. if (s[i]<=#127) then
  631. begin
  632. wc:=wchar_t(s[i]);
  633. mblen:= 1;
  634. end
  635. else
  636. {$if not(defined(beos) and not defined(haiku))}
  637. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  638. {$else not beos}
  639. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  640. {$endif beos}
  641. case mblen of
  642. size_t(-2):
  643. begin
  644. { partial invalid character, copy literally }
  645. while (i<=slen) do
  646. begin
  647. ConcatCharToAnsiStr(s[i],result,resindex);
  648. inc(i);
  649. end;
  650. end;
  651. size_t(-1), 0:
  652. begin
  653. { invalid or null character }
  654. ConcatCharToAnsiStr(s[i],result,resindex);
  655. inc(i);
  656. end;
  657. else
  658. begin
  659. { a valid sequence }
  660. { even if mblen = 1, the uppercase version may have a }
  661. { different length }
  662. { We can't do anything special if wchar_t is 16 bit... }
  663. {$if not(defined(beos) and not defined(haiku))}
  664. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  665. {$else not beos}
  666. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  667. {$endif not beos}
  668. inc(i,mblen);
  669. end;
  670. end;
  671. end;
  672. SetLength(result,resindex-1);
  673. end;
  674. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  675. var
  676. i, slen,
  677. destindex : SizeInt;
  678. uch : UCS4Char;
  679. begin
  680. slen:=length(s);
  681. setlength(result,slen+1);
  682. i:=1;
  683. destindex:=0;
  684. while (i<=slen) do
  685. begin
  686. uch:=UCS4Char(s[i]);
  687. if (uch=0) then
  688. result[destindex]:=32
  689. else if (uch<=$d7ff) or (uch>=$e000) then
  690. result[destindex]:=uch
  691. else if (uch<=$dbff) and
  692. (i<slen) and
  693. (s[i+1]>=#$dc00) and
  694. (s[i+1]<=#$dfff) then
  695. begin
  696. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  697. inc(i);
  698. end
  699. else { invalid surrogate pair }
  700. result[destindex]:=uch;
  701. inc(i);
  702. inc(destindex);
  703. end;
  704. result[destindex]:=UCS4Char(0);
  705. { Trimming length in this particular case is just a waste of time,
  706. because result will be interpreted as null-terminated and discarded
  707. almost immediately }
  708. end;
  709. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  710. {$if not(defined (aix) and defined(cpupowerpc32))}
  711. var
  712. hs1,hs2 : UCS4String;
  713. us1,us2 : WideString;
  714. begin
  715. { wcscoll interprets null chars as end-of-string -> filter out }
  716. if coIgnoreCase in Options then
  717. begin
  718. us1:=UpperWideString(s1);
  719. us2:=UpperWideString(s2);
  720. end
  721. else
  722. begin
  723. us1:=s1;
  724. us2:=s2;
  725. end;
  726. hs1:=WideStringToUCS4StringNoNulls(us1);
  727. hs2:=WideStringToUCS4StringNoNulls(us2);
  728. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  729. end;
  730. {$else}
  731. { AIX/PPC32 has a 16 bit wchar_t }
  732. var
  733. i, len: longint;
  734. us1,us2 : WideString;
  735. hs1, hs2: array of widechar;
  736. begin
  737. if coIgnoreCase in Options then
  738. begin
  739. us1:=UpperWideString(s1);
  740. us2:=UpperWideString(s2);
  741. end
  742. else
  743. begin
  744. us1:=s1;
  745. us2:=s2;
  746. end;
  747. len:=length(us1);
  748. setlength(hs1,len+1);
  749. for i:=1 to len do
  750. if us1[i]<>#0 then
  751. hs1[i-1]:=us1[i]
  752. else
  753. hs1[i-1]:=#32;
  754. hs1[len]:=#0;
  755. len:=length(us2);
  756. setlength(hs2,len+1);
  757. for i:=1 to len do
  758. if us2[i]<>#0 then
  759. hs2[i-1]:=us2[i]
  760. else
  761. hs2[i-1]:=#32;
  762. hs2[len]:=#0;
  763. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  764. end;
  765. {$endif}
  766. { return value: number of code points in the string. Whenever an invalid
  767. code point is encountered, all characters part of this invalid code point
  768. are considered to form one "character" and the next character is
  769. considered to be the start of a new (possibly also invalid) code point }
  770. function CharLengthPChar(const Str: PChar): PtrInt;
  771. var
  772. nextlen: ptrint;
  773. s: pchar;
  774. {$if not(defined(beos) and not defined(haiku))}
  775. mbstate: mbstate_t;
  776. {$endif not beos}
  777. begin
  778. result:=0;
  779. s:=str;
  780. {$if not(defined(beos) and not defined(haiku))}
  781. fillchar(mbstate,sizeof(mbstate),0);
  782. {$endif not beos}
  783. repeat
  784. {$if defined(beos) and not defined(haiku)}
  785. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  786. {$else beos}
  787. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  788. {$endif beos}
  789. { skip invalid/incomplete sequences }
  790. if (nextlen<0) then
  791. nextlen:=1;
  792. inc(result,1);
  793. inc(s,nextlen);
  794. until (nextlen=0);
  795. end;
  796. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  797. {$if not(defined(beos) and not defined(haiku))}
  798. var
  799. mbstate: mbstate_t;
  800. {$endif not beos}
  801. begin
  802. {$if defined(beos) and not defined(haiku)}
  803. result:=ptrint(mblen(str,maxlookahead));
  804. {$else beos}
  805. fillchar(mbstate,sizeof(mbstate),0);
  806. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  807. { mbrlen can also return -2 for "incomplete but potially valid character
  808. and data has been processed" }
  809. if result<0 then
  810. result:=-1;
  811. {$endif beos}
  812. end;
  813. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  814. var
  815. a,b: pchar;
  816. i: PtrInt;
  817. begin
  818. if not(canmodifys1) then
  819. getmem(a,len1+1)
  820. else
  821. a:=s1;
  822. for i:=0 to len1-1 do
  823. if s1[i]<>#0 then
  824. a[i]:=s1[i]
  825. else
  826. a[i]:=#32;
  827. a[len1]:=#0;
  828. if not(canmodifys2) then
  829. getmem(b,len2+1)
  830. else
  831. b:=s2;
  832. for i:=0 to len2-1 do
  833. if s2[i]<>#0 then
  834. b[i]:=s2[i]
  835. else
  836. b[i]:=#32;
  837. b[len2]:=#0;
  838. result:=strcoll(a,b);
  839. if not(canmodifys1) then
  840. freemem(a);
  841. if not(canmodifys2) then
  842. freemem(b);
  843. end;
  844. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  845. begin
  846. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  847. end;
  848. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  849. begin
  850. result:=strcoll(s1,s2);
  851. end;
  852. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  853. var
  854. a, b: AnsiString;
  855. begin
  856. a:=UpperAnsistring(s1);
  857. b:=UpperAnsistring(s2);
  858. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  859. end;
  860. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  861. begin
  862. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  863. end;
  864. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  865. var
  866. a, b: pchar;
  867. begin
  868. if (maxlen=0) then
  869. exit(0);
  870. if (s1[maxlen]<>#0) then
  871. begin
  872. getmem(a,maxlen+1);
  873. move(s1^,a^,maxlen);
  874. a[maxlen]:=#0;
  875. end
  876. else
  877. a:=s1;
  878. if (s2[maxlen]<>#0) then
  879. begin
  880. getmem(b,maxlen+1);
  881. move(s2^,b^,maxlen);
  882. b[maxlen]:=#0;
  883. end
  884. else
  885. b:=s2;
  886. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  887. if (a<>s1) then
  888. freemem(a);
  889. if (b<>s2) then
  890. freemem(b);
  891. end;
  892. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  893. var
  894. a, b: ansistring;
  895. begin
  896. if (maxlen=0) then
  897. exit(0);
  898. setlength(a,maxlen);
  899. move(s1^,a[1],maxlen);
  900. setlength(b,maxlen);
  901. move(s2^,b[1],maxlen);
  902. result:=AnsiCompareText(a,b);
  903. end;
  904. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  905. var
  906. newlen: sizeint;
  907. begin
  908. newlen:=length(s);
  909. if newlen>strlen(orgp) then
  910. fpc_rangeerror;
  911. p:=orgp;
  912. if (newlen>0) then
  913. move(s[1],p[0],newlen);
  914. p[newlen]:=#0;
  915. end;
  916. function AnsiStrLower(Str: PChar): PChar;
  917. var
  918. temp: ansistring;
  919. begin
  920. temp:=loweransistring(str);
  921. ansi2pchar(temp,str,result);
  922. end;
  923. function AnsiStrUpper(Str: PChar): PChar;
  924. var
  925. temp: ansistring;
  926. begin
  927. temp:=upperansistring(str);
  928. ansi2pchar(temp,str,result);
  929. end;
  930. function envvarset(const varname: pchar): boolean;
  931. var
  932. varval: pchar;
  933. begin
  934. varval:=fpgetenv(varname);
  935. result:=
  936. assigned(varval) and
  937. (varval[0]<>#0);
  938. end;
  939. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  940. var
  941. langinfo: pchar;
  942. begin
  943. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  944. if stdcp=scpFileSystemSingleByte then
  945. begin
  946. result:=CP_UTF8;
  947. exit;
  948. end;
  949. {$endif}
  950. { if none of the relevant LC_* environment variables are set, fall back to
  951. UTF-8 (this happens under some versions of OS X for GUI applications, which
  952. otherwise get CP_ASCII) }
  953. if envvarset('LC_ALL') or
  954. envvarset('LC_CTYPE') or
  955. envvarset('LANG') then
  956. begin
  957. langinfo:=nl_langinfo(CODESET);
  958. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  959. that causes it to return an empty string of UTF-8 locales
  960. -> patch up (and in general, UTF-8 is a good default on
  961. Unix platforms) }
  962. if not assigned(langinfo) or
  963. (langinfo^=#0) then
  964. langinfo:='UTF-8';
  965. Result:=GetCodepageByName(ansistring(langinfo));
  966. end
  967. else
  968. Result:=unixcp.GetSystemCodepage;
  969. end;
  970. {$ifdef FPC_HAS_CPSTRING}
  971. procedure SetStdIOCodePage(var T: Text); inline;
  972. begin
  973. case TextRec(T).Mode of
  974. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  975. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  976. end;
  977. end;
  978. procedure SetStdIOCodePages; inline;
  979. begin
  980. SetStdIOCodePage(Input);
  981. SetStdIOCodePage(Output);
  982. SetStdIOCodePage(ErrOutput);
  983. SetStdIOCodePage(StdOut);
  984. SetStdIOCodePage(StdErr);
  985. end;
  986. {$endif FPC_HAS_CPSTRING}
  987. var
  988. OrgWideStringManager: TUnicodeStringManager;
  989. Procedure SetCWideStringManager;
  990. Var
  991. CWideStringManager : TUnicodeStringManager;
  992. begin
  993. GetUnicodeStringManager(OrgWideStringManager);
  994. CWideStringManager:=OrgWideStringManager;
  995. With CWideStringManager do
  996. begin
  997. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  998. Ansi2WideMoveProc:=@Ansi2WideMove;
  999. UpperWideStringProc:=@UpperWideString;
  1000. LowerWideStringProc:=@LowerWideString;
  1001. CompareWideStringProc:=@CompareWideString;
  1002. // CompareTextWideStringProc:=@CompareTextWideString;
  1003. CharLengthPCharProc:=@CharLengthPChar;
  1004. CodePointLengthProc:=@CodePointLength;
  1005. UpperAnsiStringProc:=@UpperAnsiString;
  1006. LowerAnsiStringProc:=@LowerAnsiString;
  1007. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  1008. CompareTextAnsiStringProc:=@AnsiCompareText;
  1009. StrCompAnsiStringProc:=@StrCompAnsi;
  1010. StrICompAnsiStringProc:=@AnsiStrIComp;
  1011. StrLCompAnsiStringProc:=@AnsiStrLComp;
  1012. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  1013. StrLowerAnsiStringProc:=@AnsiStrLower;
  1014. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1015. ThreadInitProc:=@InitThread;
  1016. ThreadFiniProc:=@FiniThread;
  1017. { Unicode }
  1018. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1019. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1020. UpperUnicodeStringProc:=@UpperWideString;
  1021. LowerUnicodeStringProc:=@LowerWideString;
  1022. CompareUnicodeStringProc:=@CompareWideString;
  1023. { CodePage }
  1024. GetStandardCodePageProc:=@GetStandardCodePage;
  1025. end;
  1026. SetUnicodeStringManager(CWideStringManager);
  1027. end;
  1028. var
  1029. iconvlib:TLibHandle;
  1030. initialization
  1031. SetCWideStringManager;
  1032. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1033. { with the information from the environment variables according to POSIX }
  1034. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1035. setlocale(LC_ALL,'');
  1036. { load iconvctl function }
  1037. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1038. if iconvlib<>0 then
  1039. pointer(iconvctl):=GetProcAddress(iconvlib,iconvctlname);
  1040. { set the DefaultSystemCodePage }
  1041. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1042. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1043. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1044. {$ifdef FPC_HAS_CPSTRING}
  1045. SetStdIOCodePages;
  1046. {$endif FPC_HAS_CPSTRING}
  1047. { init conversion tables for main program }
  1048. InitThread;
  1049. finalization
  1050. { fini conversion tables for main program }
  1051. FiniThread;
  1052. { unload iconv library }
  1053. if iconvlib<>0 then
  1054. FreeLibrary(iconvlib);
  1055. { restore previous (probably default) widestring manager so that subsequent calls
  1056. into the widestring manager won't trigger the finalized functionality }
  1057. SetWideStringManager(OrgWideStringManager);
  1058. end.