cwstring.pp 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. {$implicitexceptions off}
  15. unit cwstring;
  16. interface
  17. procedure SetCWidestringManager;
  18. implementation
  19. {$linklib c}
  20. // Linux (and maybe glibc platforms in general), have iconv in glibc.
  21. {$if defined(linux) or defined(solaris)}
  22. {$define iconv_is_in_libc}
  23. {$endif}
  24. {$ifdef netbsd}
  25. {$ifndef DISABLE_ICONV_LIBC}
  26. {$define iconv_is_in_libc}
  27. {$endif}
  28. {$endif}
  29. {$ifndef iconv_is_in_libc}
  30. {$if defined(haiku)}
  31. {$linklib textencoding}
  32. {$else}
  33. {$linklib iconv}
  34. {$endif}
  35. {$define useiconv}
  36. {$endif not iconv_is_in_libc}
  37. {$i rtldefs.inc}
  38. Uses
  39. BaseUnix,
  40. ctypes,
  41. unix,
  42. unixtype,
  43. initc,
  44. dynlibs,
  45. unixcp;
  46. Const
  47. {$ifndef useiconv}
  48. libiconvname='c'; // is in libc under Linux.
  49. libprefix='lib';
  50. {$else}
  51. {$ifdef haiku}
  52. libiconvname='textencoding'; // is in libtextencoding under Haiku
  53. libprefix='lib';
  54. {$else}
  55. {$if defined(darwin) or defined(aix)}
  56. libiconvname='libiconv';
  57. libprefix='';
  58. {$else}
  59. libiconvname='iconv';
  60. libprefix='lib';
  61. {$endif}
  62. {$endif}
  63. {$endif}
  64. { helper functions from libc }
  65. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  66. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  67. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  68. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  69. {$ifdef netbsd}
  70. { NetBSD has a new setlocale function defined in /usr/include/locale.h
  71. that should be used }
  72. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name '__setlocale50';
  73. {$else}
  74. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  75. {$endif}
  76. {$if not(defined(beos) and not defined(haiku))}
  77. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  78. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  79. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  80. {$else beos}
  81. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  82. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  83. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  84. {$endif beos}
  85. const
  86. {$if defined(linux)}
  87. __LC_CTYPE = 0;
  88. LC_ALL = 6;
  89. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  90. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  91. CODESET = _NL_CTYPE_CODESET_NAME;
  92. {$elseif defined(darwin)}
  93. CODESET = 0;
  94. LC_ALL = 0;
  95. {$elseif defined(FreeBSD)} // actually FreeBSD5. internationalisation is afaik not default on 4.
  96. __LC_CTYPE = 0;
  97. LC_ALL = 0;
  98. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  99. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  100. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  101. {$elseif defined(solaris)}
  102. {$define ACCEPT_646}
  103. CODESET=49;
  104. LC_ALL = 6;
  105. {$elseif defined(beos)}
  106. {$ifdef haiku}
  107. CODESET= 0; // Checked for Haiku
  108. LC_ALL = 0; // Checked for Haiku
  109. {$else}
  110. {$warning check correct value for BeOS}
  111. CODESET=49;
  112. LC_ALL = 6; // Checked for BeOS
  113. {$endif}
  114. ESysEILSEQ = EILSEQ;
  115. {$elseif defined(OpenBSD)}
  116. {$define ACCEPT_646}
  117. CODESET = 51;
  118. LC_ALL = 0;
  119. {$elseif defined(NetBSD)}
  120. {$define ACCEPT_646}
  121. CODESET = 51;
  122. LC_ALL = 0;
  123. {$elseif defined(aix)}
  124. CODESET = 49;
  125. LC_ALL = -1;
  126. {$elseif defined(dragonfly)}
  127. CODESET = 0;
  128. LC_ALL = 0;
  129. __LC_CTYPE = 0;
  130. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  131. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  132. {$else not aix}
  133. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  134. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  135. {$endif}
  136. { unicode encoding name }
  137. {$ifdef FPC_LITTLE_ENDIAN}
  138. unicode_encoding2 = 'UTF-16LE';
  139. unicode_encoding4 = 'UCS-4LE';
  140. {$else FPC_LITTLE_ENDIAN}
  141. {$ifdef AIX}
  142. unicode_encoding2 = 'UTF-16';
  143. {$else AIX}
  144. unicode_encoding2 = 'UTF-16BE';
  145. unicode_encoding4 = 'UCS-4BE';
  146. {$endif AIX}
  147. {$endif FPC_LITTLE_ENDIAN}
  148. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  149. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  150. { that's a libc macro mapped to internal functions/variables }
  151. { and thus not a stable external API on systems where libc }
  152. { breaks backwards compatibility every now and then }
  153. MB_CUR_MAX = 10;
  154. { Requests for iconvctl }
  155. ICONV_TRIVIALP = 0; // int *argument
  156. ICONV_GET_TRANSLITERATE = 1; // int *argument
  157. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  158. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  159. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  160. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  161. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  162. type
  163. piconv_t = ^iconv_t;
  164. iconv_t = pointer;
  165. nl_item = cint;
  166. {$ifdef haiku}
  167. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'root' name 'nl_langinfo';
  168. {$else}
  169. {$ifndef beos}
  170. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  171. {$endif}
  172. {$endif}
  173. {$if (not defined(bsd) and not defined(beos)) or defined(iconv_is_in_libc) or (defined(darwin) and not defined(cpupowerpc32))}
  174. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  175. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  176. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  177. const
  178. iconvctlname='iconvctl';
  179. {$else}
  180. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  181. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  182. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  183. const
  184. iconvctlname='libiconvctl';
  185. {$endif}
  186. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  187. threadvar
  188. iconv_ansi2wide,
  189. iconv_wide2ansi : iconv_t;
  190. { since we cache the iconv_t converters, we have to do the same
  191. for the DefaultSystemCodePage variable since if it changes, we
  192. have to re-initialize the converters too. We can't do that via
  193. a callback in the widestring manager because DefaultSystemCodePage
  194. is not a threadvar and we can't automatically change this in all
  195. threads }
  196. current_DefaultSystemCodePage: TSystemCodePage;
  197. procedure InitThread;
  198. var
  199. transliterate: cint;
  200. iconvindex: longint;
  201. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  202. iconvname: rawbytestring;
  203. {$endif}
  204. begin
  205. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  206. {$if not(defined(darwin) and (defined(cpuarm) or defined(cpuaarch64))) and not defined(iphonesim)}
  207. iconvindex:=GetCodepageData(DefaultSystemCodePage);
  208. if iconvindex<>-1 then
  209. iconvname:=UnixCpMap[iconvindex].name
  210. else
  211. { default to UTF-8 on Unix platforms }
  212. iconvname:='UTF-8';
  213. iconv_wide2ansi:=iconv_open(pchar(iconvname+'//TRANSLIT'),unicode_encoding2);
  214. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  215. {$else}
  216. { Unix locale settings are ignored on iPhoneOS/iPhoneSimulator }
  217. iconv_wide2ansi:=iconv_open('UTF-8//TRANSLIT',unicode_encoding2);
  218. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  219. {$endif}
  220. end;
  221. procedure FiniThread;
  222. begin
  223. if (iconv_wide2ansi <> iconv_t(-1)) then
  224. iconv_close(iconv_wide2ansi);
  225. if (iconv_ansi2wide <> iconv_t(-1)) then
  226. iconv_close(iconv_ansi2wide);
  227. end;
  228. {$if defined(beos) and not defined(haiku)}
  229. function nl_langinfo(__item:nl_item):pchar;
  230. begin
  231. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  232. // Now implement the minimum required to correctly initialize WideString support
  233. case __item of
  234. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  235. else
  236. begin
  237. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  238. Result := '';
  239. end;
  240. end;
  241. end;
  242. {$endif}
  243. function open_iconv_for_cps(cp: TSystemCodePage; const otherencoding: pchar; cp_is_from: boolean): iconv_t;
  244. var
  245. iconvindex: longint;
  246. begin
  247. { TODO: add caching (then we also don't need separate code for
  248. the default system page and other ones)
  249. -- typecasting an ansistring function result to pchar is
  250. unsafe normally, but these are constant strings -> no
  251. problem }
  252. open_iconv_for_cps:=iconv_t(-1);
  253. iconvindex:=GetCodepageData(cp);
  254. if iconvindex=-1 then
  255. exit;
  256. repeat
  257. if cp_is_from then
  258. open_iconv_for_cps:=iconv_open(otherencoding,pchar(UnixCpMap[iconvindex].name))
  259. else
  260. open_iconv_for_cps:=iconv_open(pchar(UnixCpMap[iconvindex].name+'//TRANSLIT'),otherencoding);
  261. inc(iconvindex);
  262. until (open_iconv_for_cps<>iconv_t(-1)) or
  263. (iconvindex>high(UnixCpMap)) or
  264. (UnixCpMap[iconvindex].cp<>cp);
  265. end;
  266. {$ifdef aix}
  267. {$i cwstraix.inc}
  268. {$endif aix}
  269. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  270. var
  271. outlength,
  272. outoffset,
  273. srclen,
  274. outleft : size_t;
  275. use_iconv: iconv_t;
  276. srcpos : pwidechar;
  277. destpos: pchar;
  278. mynil : pchar;
  279. my0 : size_t;
  280. err : longint;
  281. free_iconv: boolean;
  282. {$ifdef aix}
  283. intermediate: rawbytestring;
  284. {$endif aix}
  285. begin
  286. {$ifdef aix}
  287. { AIX libiconv does not support converting cp866 to anything else except
  288. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  289. if cp=866 then
  290. begin
  291. Wide2AnsiMove(source,intermediate,28595,len);
  292. if handle_aix_intermediate(pchar(intermediate),28595,cp,dest,len) then
  293. exit;
  294. end;
  295. {$endif aix}
  296. if (cp=DefaultSystemCodePage) then
  297. begin
  298. { update iconv converter in case the DefaultSystemCodePage has been
  299. changed }
  300. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  301. begin
  302. FiniThread;
  303. InitThread;
  304. end;
  305. use_iconv:=iconv_wide2ansi;
  306. free_iconv:=false;
  307. end
  308. else
  309. begin
  310. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,false);
  311. free_iconv:=true;
  312. end;
  313. { unsupported encoding -> default move }
  314. if use_iconv=iconv_t(-1) then
  315. begin
  316. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  317. exit;
  318. end;
  319. mynil:=nil;
  320. my0:=0;
  321. { rought estimation }
  322. setlength(dest,len*3);
  323. outlength:=len*3;
  324. srclen:=len*2;
  325. srcpos:=source;
  326. destpos:=pchar(dest);
  327. outleft:=outlength;
  328. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  329. begin
  330. err:=fpgetCerrno;
  331. case err of
  332. { last character is incomplete sequence }
  333. ESysEINVAL,
  334. { incomplete sequence in the middle }
  335. ESysEILSEQ:
  336. begin
  337. { skip and set to '?' }
  338. inc(srcpos);
  339. dec(srclen,2);
  340. destpos^:='?';
  341. inc(destpos);
  342. dec(outleft);
  343. { reset }
  344. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  345. if err=ESysEINVAL then
  346. break;
  347. end;
  348. ESysE2BIG:
  349. begin
  350. outoffset:=destpos-pchar(dest);
  351. { extend }
  352. setlength(dest,outlength+len*3);
  353. inc(outleft,len*3);
  354. inc(outlength,len*3);
  355. { string could have been moved }
  356. destpos:=pchar(dest)+outoffset;
  357. end;
  358. else
  359. runerror(231);
  360. end;
  361. end;
  362. // truncate string
  363. setlength(dest,length(dest)-outleft);
  364. SetCodePage(dest,cp,false);
  365. if free_iconv then
  366. iconv_close(use_iconv);
  367. end;
  368. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  369. var
  370. outlength,
  371. outoffset,
  372. outleft : size_t;
  373. use_iconv: iconv_t;
  374. srcpos,
  375. destpos: pchar;
  376. mynil : pchar;
  377. my0 : size_t;
  378. err: cint;
  379. iconvindex: longint;
  380. free_iconv: boolean;
  381. {$ifdef aix}
  382. intermediate: rawbytestring;
  383. {$endif aix}
  384. begin
  385. {$ifdef aix}
  386. { AIX libiconv does not support converting cp866 to anything else except
  387. for iso-8859-5 -> always first convert to iso-8859-5, then to UTF-16 }
  388. if cp=866 then
  389. begin
  390. if handle_aix_intermediate(source,cp,cp,intermediate,len) then
  391. source:=pchar(intermediate);
  392. end;
  393. {$endif aix}
  394. if (cp=DefaultSystemCodePage) then
  395. begin
  396. { update iconv converter in case the DefaultSystemCodePage has been
  397. changed }
  398. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  399. begin
  400. FiniThread;
  401. InitThread;
  402. end;
  403. use_iconv:=iconv_ansi2wide;
  404. free_iconv:=false;
  405. end
  406. else
  407. begin
  408. { TODO: add caching (then we also don't need separate code for
  409. the default system page and other ones)
  410. -- typecasting an ansistring function result to pchar is
  411. unsafe normally, but these are constant strings -> no
  412. problem }
  413. use_iconv:=open_iconv_for_cps(cp,unicode_encoding2,true);
  414. free_iconv:=true;
  415. end;
  416. { unsupported encoding -> default move }
  417. if use_iconv=iconv_t(-1) then
  418. begin
  419. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  420. exit;
  421. end;
  422. mynil:=nil;
  423. my0:=0;
  424. // extra space
  425. outlength:=len+1;
  426. setlength(dest,outlength);
  427. srcpos:=source;
  428. destpos:=pchar(dest);
  429. outleft:=outlength*2;
  430. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  431. begin
  432. err:=fpgetCerrno;
  433. case err of
  434. ESysEINVAL,
  435. ESysEILSEQ:
  436. begin
  437. { skip and set to '?' }
  438. inc(srcpos);
  439. dec(len);
  440. pwidechar(destpos)^:='?';
  441. inc(destpos,2);
  442. dec(outleft,2);
  443. { reset }
  444. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  445. if err=ESysEINVAL then
  446. break;
  447. end;
  448. ESysE2BIG:
  449. begin
  450. outoffset:=destpos-pchar(dest);
  451. { extend }
  452. setlength(dest,outlength+len);
  453. inc(outleft,len*2);
  454. inc(outlength,len);
  455. { string could have been moved }
  456. destpos:=pchar(dest)+outoffset;
  457. end;
  458. else
  459. runerror(231);
  460. end;
  461. end;
  462. // truncate string
  463. setlength(dest,length(dest)-outleft div 2);
  464. if free_iconv then
  465. iconv_close(use_iconv);
  466. end;
  467. function LowerWideString(const s : WideString) : WideString;
  468. var
  469. i : SizeInt;
  470. begin
  471. SetLength(result,length(s));
  472. for i:=0 to length(s)-1 do
  473. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  474. end;
  475. function UpperWideString(const s : WideString) : WideString;
  476. var
  477. i : SizeInt;
  478. begin
  479. SetLength(result,length(s));
  480. for i:=0 to length(s)-1 do
  481. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  482. end;
  483. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  484. begin
  485. if (len>length(s)) then
  486. if (length(s) < 10*256) then
  487. setlength(s,length(s)+10)
  488. else
  489. setlength(s,length(s)+length(s) shr 8);
  490. end;
  491. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  492. begin
  493. EnsureAnsiLen(s,index);
  494. pchar(@s[index])^:=c;
  495. inc(index);
  496. end;
  497. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  498. {$if not(defined(beos) and not defined(haiku))}
  499. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  500. {$else not beos}
  501. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  502. {$endif beos}
  503. var
  504. p : pchar;
  505. mblen : size_t;
  506. begin
  507. { we know that s is unique -> avoid uniquestring calls}
  508. p:=@s[index];
  509. if (nc<=127) then
  510. ConcatCharToAnsiStr(char(nc),s,index)
  511. else
  512. begin
  513. EnsureAnsiLen(s,index+MB_CUR_MAX);
  514. {$if not(defined(beos) and not defined(haiku))}
  515. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  516. {$else not beos}
  517. mblen:=wctomb(p,wchar_t(nc));
  518. {$endif not beos}
  519. if (mblen<>size_t(-1)) then
  520. inc(index,mblen)
  521. else
  522. begin
  523. { invalid wide char }
  524. p^:='?';
  525. inc(index);
  526. end;
  527. end;
  528. end;
  529. function LowerAnsiString(const s : AnsiString) : AnsiString;
  530. var
  531. i, slen,
  532. resindex : SizeInt;
  533. mblen : size_t;
  534. {$if not(defined(beos) and not defined(haiku))}
  535. ombstate,
  536. nmbstate : mbstate_t;
  537. {$endif beos}
  538. wc : wchar_t;
  539. begin
  540. {$if not(defined(beos) and not defined(haiku))}
  541. fillchar(ombstate,sizeof(ombstate),0);
  542. fillchar(nmbstate,sizeof(nmbstate),0);
  543. {$endif beos}
  544. slen:=length(s);
  545. SetLength(result,slen+10);
  546. i:=1;
  547. resindex:=1;
  548. while (i<=slen) do
  549. begin
  550. if (s[i]<=#127) then
  551. begin
  552. wc:=wchar_t(s[i]);
  553. mblen:= 1;
  554. end
  555. else
  556. {$if not(defined(beos) and not defined(haiku))}
  557. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  558. {$else not beos}
  559. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  560. {$endif not beos}
  561. case mblen of
  562. size_t(-2):
  563. begin
  564. { partial invalid character, copy literally }
  565. while (i<=slen) do
  566. begin
  567. ConcatCharToAnsiStr(s[i],result,resindex);
  568. inc(i);
  569. end;
  570. end;
  571. size_t(-1), 0:
  572. begin
  573. { invalid or null character }
  574. ConcatCharToAnsiStr(s[i],result,resindex);
  575. inc(i);
  576. end;
  577. else
  578. begin
  579. { a valid sequence }
  580. { even if mblen = 1, the lowercase version may have a }
  581. { different length }
  582. { We can't do anything special if wchar_t is 16 bit... }
  583. {$if not(defined(beos) and not defined(haiku))}
  584. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  585. {$else not beos}
  586. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  587. {$endif not beos}
  588. inc(i,mblen);
  589. end;
  590. end;
  591. end;
  592. SetLength(result,resindex-1);
  593. end;
  594. function UpperAnsiString(const s : AnsiString) : AnsiString;
  595. var
  596. i, slen,
  597. resindex : SizeInt;
  598. mblen : size_t;
  599. {$if not(defined(beos) and not defined(haiku))}
  600. ombstate,
  601. nmbstate : mbstate_t;
  602. {$endif beos}
  603. wc : wchar_t;
  604. begin
  605. {$if not(defined(beos) and not defined(haiku))}
  606. fillchar(ombstate,sizeof(ombstate),0);
  607. fillchar(nmbstate,sizeof(nmbstate),0);
  608. {$endif beos}
  609. slen:=length(s);
  610. SetLength(result,slen+10);
  611. i:=1;
  612. resindex:=1;
  613. while (i<=slen) do
  614. begin
  615. if (s[i]<=#127) then
  616. begin
  617. wc:=wchar_t(s[i]);
  618. mblen:= 1;
  619. end
  620. else
  621. {$if not(defined(beos) and not defined(haiku))}
  622. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  623. {$else not beos}
  624. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  625. {$endif beos}
  626. case mblen of
  627. size_t(-2):
  628. begin
  629. { partial invalid character, copy literally }
  630. while (i<=slen) do
  631. begin
  632. ConcatCharToAnsiStr(s[i],result,resindex);
  633. inc(i);
  634. end;
  635. end;
  636. size_t(-1), 0:
  637. begin
  638. { invalid or null character }
  639. ConcatCharToAnsiStr(s[i],result,resindex);
  640. inc(i);
  641. end;
  642. else
  643. begin
  644. { a valid sequence }
  645. { even if mblen = 1, the uppercase version may have a }
  646. { different length }
  647. { We can't do anything special if wchar_t is 16 bit... }
  648. {$if not(defined(beos) and not defined(haiku))}
  649. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  650. {$else not beos}
  651. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  652. {$endif not beos}
  653. inc(i,mblen);
  654. end;
  655. end;
  656. end;
  657. SetLength(result,resindex-1);
  658. end;
  659. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  660. var
  661. i, slen,
  662. destindex : SizeInt;
  663. uch : UCS4Char;
  664. begin
  665. slen:=length(s);
  666. setlength(result,slen+1);
  667. i:=1;
  668. destindex:=0;
  669. while (i<=slen) do
  670. begin
  671. uch:=UCS4Char(s[i]);
  672. if (uch=0) then
  673. result[destindex]:=32
  674. else if (uch<=$d7ff) or (uch>=$e000) then
  675. result[destindex]:=uch
  676. else if (uch<=$dbff) and
  677. (i<slen) and
  678. (s[i+1]>=#$dc00) and
  679. (s[i+1]<=#$dfff) then
  680. begin
  681. result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00);
  682. inc(i);
  683. end
  684. else { invalid surrogate pair }
  685. result[destindex]:=uch;
  686. inc(i);
  687. inc(destindex);
  688. end;
  689. result[destindex]:=UCS4Char(0);
  690. { Trimming length in this particular case is just a waste of time,
  691. because result will be interpreted as null-terminated and discarded
  692. almost immediately }
  693. end;
  694. function CompareWideString(const s1, s2 : WideString; Options : TCompareOptions) : PtrInt;
  695. {$if not(defined (aix) and defined(cpupowerpc32))}
  696. var
  697. hs1,hs2 : UCS4String;
  698. us1,us2 : WideString;
  699. begin
  700. { wcscoll interprets null chars as end-of-string -> filter out }
  701. if coIgnoreCase in Options then
  702. begin
  703. us1:=UpperWideString(s1);
  704. us2:=UpperWideString(s2);
  705. end
  706. else
  707. begin
  708. us1:=s1;
  709. us2:=s2;
  710. end;
  711. hs1:=WideStringToUCS4StringNoNulls(us1);
  712. hs2:=WideStringToUCS4StringNoNulls(us2);
  713. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  714. end;
  715. {$else}
  716. { AIX/PPC32 has a 16 bit wchar_t }
  717. var
  718. i, len: longint;
  719. us1,us2 : WideString;
  720. hs1, hs2: array of widechar;
  721. begin
  722. if coIgnoreCase in Options then
  723. begin
  724. us1:=UpperWideString(s1);
  725. us2:=UpperWideString(s2);
  726. end
  727. else
  728. begin
  729. us1:=s1;
  730. us2:=s2;
  731. end;
  732. len:=length(us1);
  733. setlength(hs1,len+1);
  734. for i:=1 to len do
  735. if us1[i]<>#0 then
  736. hs1[i-1]:=us1[i]
  737. else
  738. hs1[i-1]:=#32;
  739. hs1[len]:=#0;
  740. len:=length(us2);
  741. setlength(hs2,len+1);
  742. for i:=1 to len do
  743. if us2[i]<>#0 then
  744. hs2[i-1]:=us2[i]
  745. else
  746. hs2[i-1]:=#32;
  747. hs2[len]:=#0;
  748. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  749. end;
  750. {$endif}
  751. { return value: number of code points in the string. Whenever an invalid
  752. code point is encountered, all characters part of this invalid code point
  753. are considered to form one "character" and the next character is
  754. considered to be the start of a new (possibly also invalid) code point }
  755. function CharLengthPChar(const Str: PChar): PtrInt;
  756. var
  757. nextlen: ptrint;
  758. s: pchar;
  759. {$if not(defined(beos) and not defined(haiku))}
  760. mbstate: mbstate_t;
  761. {$endif not beos}
  762. begin
  763. result:=0;
  764. s:=str;
  765. {$if not(defined(beos) and not defined(haiku))}
  766. fillchar(mbstate,sizeof(mbstate),0);
  767. {$endif not beos}
  768. repeat
  769. {$if defined(beos) and not defined(haiku)}
  770. nextlen:=ptrint(mblen(s,MB_CUR_MAX));
  771. {$else beos}
  772. nextlen:=ptrint(mbrlen(s,MB_CUR_MAX,@mbstate));
  773. {$endif beos}
  774. { skip invalid/incomplete sequences }
  775. if (nextlen<0) then
  776. nextlen:=1;
  777. inc(result,1);
  778. inc(s,nextlen);
  779. until (nextlen=0);
  780. end;
  781. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  782. {$if not(defined(beos) and not defined(haiku))}
  783. var
  784. mbstate: mbstate_t;
  785. {$endif not beos}
  786. begin
  787. {$if defined(beos) and not defined(haiku)}
  788. result:=ptrint(mblen(str,maxlookahead));
  789. {$else beos}
  790. fillchar(mbstate,sizeof(mbstate),0);
  791. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  792. { mbrlen can also return -2 for "incomplete but potially valid character
  793. and data has been processed" }
  794. if result<0 then
  795. result:=-1;
  796. {$endif beos}
  797. end;
  798. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  799. var
  800. a,b: pchar;
  801. i: PtrInt;
  802. begin
  803. if not(canmodifys1) then
  804. getmem(a,len1+1)
  805. else
  806. a:=s1;
  807. for i:=0 to len1-1 do
  808. if s1[i]<>#0 then
  809. a[i]:=s1[i]
  810. else
  811. a[i]:=#32;
  812. a[len1]:=#0;
  813. if not(canmodifys2) then
  814. getmem(b,len2+1)
  815. else
  816. b:=s2;
  817. for i:=0 to len2-1 do
  818. if s2[i]<>#0 then
  819. b[i]:=s2[i]
  820. else
  821. b[i]:=#32;
  822. b[len2]:=#0;
  823. result:=strcoll(a,b);
  824. if not(canmodifys1) then
  825. freemem(a);
  826. if not(canmodifys2) then
  827. freemem(b);
  828. end;
  829. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  830. begin
  831. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  832. end;
  833. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  834. begin
  835. result:=strcoll(s1,s2);
  836. end;
  837. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  838. var
  839. a, b: AnsiString;
  840. begin
  841. a:=UpperAnsistring(s1);
  842. b:=UpperAnsistring(s2);
  843. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  844. end;
  845. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  846. begin
  847. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  848. end;
  849. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  850. var
  851. a, b: pchar;
  852. begin
  853. if (maxlen=0) then
  854. exit(0);
  855. if (s1[maxlen]<>#0) then
  856. begin
  857. getmem(a,maxlen+1);
  858. move(s1^,a^,maxlen);
  859. a[maxlen]:=#0;
  860. end
  861. else
  862. a:=s1;
  863. if (s2[maxlen]<>#0) then
  864. begin
  865. getmem(b,maxlen+1);
  866. move(s2^,b^,maxlen);
  867. b[maxlen]:=#0;
  868. end
  869. else
  870. b:=s2;
  871. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  872. if (a<>s1) then
  873. freemem(a);
  874. if (b<>s2) then
  875. freemem(b);
  876. end;
  877. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  878. var
  879. a, b: ansistring;
  880. begin
  881. if (maxlen=0) then
  882. exit(0);
  883. setlength(a,maxlen);
  884. move(s1^,a[1],maxlen);
  885. setlength(b,maxlen);
  886. move(s2^,b[1],maxlen);
  887. result:=AnsiCompareText(a,b);
  888. end;
  889. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  890. var
  891. newlen: sizeint;
  892. begin
  893. newlen:=length(s);
  894. if newlen>strlen(orgp) then
  895. fpc_rangeerror;
  896. p:=orgp;
  897. if (newlen>0) then
  898. move(s[1],p[0],newlen);
  899. p[newlen]:=#0;
  900. end;
  901. function AnsiStrLower(Str: PChar): PChar;
  902. var
  903. temp: ansistring;
  904. begin
  905. temp:=loweransistring(str);
  906. ansi2pchar(temp,str,result);
  907. end;
  908. function AnsiStrUpper(Str: PChar): PChar;
  909. var
  910. temp: ansistring;
  911. begin
  912. temp:=upperansistring(str);
  913. ansi2pchar(temp,str,result);
  914. end;
  915. function envvarset(const varname: pchar): boolean;
  916. var
  917. varval: pchar;
  918. begin
  919. varval:=fpgetenv(varname);
  920. result:=
  921. assigned(varval) and
  922. (varval[0]<>#0);
  923. end;
  924. function GetStandardCodePage(const stdcp: TStandardCodePageEnum): TSystemCodePage;
  925. var
  926. langinfo: pchar;
  927. begin
  928. {$ifdef FPCRTL_FILESYSTEM_UTF8}
  929. if stdcp=scpFileSystemSingleByte then
  930. begin
  931. result:=CP_UTF8;
  932. exit;
  933. end;
  934. {$endif}
  935. { if none of the relevant LC_* environment variables are set, fall back to
  936. UTF-8 (this happens under some versions of OS X for GUI applications, which
  937. otherwise get CP_ASCII) }
  938. if envvarset('LC_ALL') or
  939. envvarset('LC_CTYPE') or
  940. envvarset('LANG') then
  941. begin
  942. langinfo:=nl_langinfo(CODESET);
  943. { there's a bug in the Mac OS X 10.5 libc (based on FreeBSD's)
  944. that causes it to return an empty string of UTF-8 locales
  945. -> patch up (and in general, UTF-8 is a good default on
  946. Unix platforms) }
  947. if not assigned(langinfo) or
  948. (langinfo^=#0) then
  949. langinfo:='UTF-8';
  950. Result:=GetCodepageByName(ansistring(langinfo));
  951. end
  952. else
  953. Result:=unixcp.GetSystemCodepage;
  954. end;
  955. {$ifdef FPC_HAS_CPSTRING}
  956. procedure SetStdIOCodePage(var T: Text); inline;
  957. begin
  958. case TextRec(T).Mode of
  959. fmInput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleInput);
  960. fmOutput:TextRec(T).CodePage:=GetStandardCodePage(scpConsoleOutput);
  961. end;
  962. end;
  963. procedure SetStdIOCodePages; inline;
  964. begin
  965. SetStdIOCodePage(Input);
  966. SetStdIOCodePage(Output);
  967. SetStdIOCodePage(ErrOutput);
  968. SetStdIOCodePage(StdOut);
  969. SetStdIOCodePage(StdErr);
  970. end;
  971. {$endif FPC_HAS_CPSTRING}
  972. var
  973. OrgWideStringManager: TUnicodeStringManager;
  974. Procedure SetCWideStringManager;
  975. Var
  976. CWideStringManager : TUnicodeStringManager;
  977. begin
  978. GetUnicodeStringManager(OrgWideStringManager);
  979. CWideStringManager:=OrgWideStringManager;
  980. With CWideStringManager do
  981. begin
  982. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  983. Ansi2WideMoveProc:=@Ansi2WideMove;
  984. UpperWideStringProc:=@UpperWideString;
  985. LowerWideStringProc:=@LowerWideString;
  986. CompareWideStringProc:=@CompareWideString;
  987. // CompareTextWideStringProc:=@CompareTextWideString;
  988. CharLengthPCharProc:=@CharLengthPChar;
  989. CodePointLengthProc:=@CodePointLength;
  990. UpperAnsiStringProc:=@UpperAnsiString;
  991. LowerAnsiStringProc:=@LowerAnsiString;
  992. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  993. CompareTextAnsiStringProc:=@AnsiCompareText;
  994. StrCompAnsiStringProc:=@StrCompAnsi;
  995. StrICompAnsiStringProc:=@AnsiStrIComp;
  996. StrLCompAnsiStringProc:=@AnsiStrLComp;
  997. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  998. StrLowerAnsiStringProc:=@AnsiStrLower;
  999. StrUpperAnsiStringProc:=@AnsiStrUpper;
  1000. ThreadInitProc:=@InitThread;
  1001. ThreadFiniProc:=@FiniThread;
  1002. { Unicode }
  1003. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  1004. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  1005. UpperUnicodeStringProc:=@UpperWideString;
  1006. LowerUnicodeStringProc:=@LowerWideString;
  1007. CompareUnicodeStringProc:=@CompareWideString;
  1008. { CodePage }
  1009. GetStandardCodePageProc:=@GetStandardCodePage;
  1010. end;
  1011. SetUnicodeStringManager(CWideStringManager);
  1012. end;
  1013. var
  1014. iconvlib:TLibHandle;
  1015. initialization
  1016. SetCWideStringManager;
  1017. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  1018. { with the information from the environment variables according to POSIX }
  1019. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  1020. setlocale(LC_ALL,'');
  1021. { load iconv library }
  1022. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix);
  1023. if iconvlib=0 then
  1024. iconvlib:=LoadLibrary(libprefix+libiconvname+'.'+SharedSuffix+'.6');
  1025. { set the DefaultSystemCodePage }
  1026. DefaultSystemCodePage:=GetStandardCodePage(scpAnsi);
  1027. DefaultFileSystemCodePage:=GetStandardCodePage(scpFileSystemSingleByte);
  1028. DefaultRTLFileSystemCodePage:=DefaultFileSystemCodePage;
  1029. {$ifdef FPC_HAS_CPSTRING}
  1030. SetStdIOCodePages;
  1031. {$endif FPC_HAS_CPSTRING}
  1032. { init conversion tables for main program }
  1033. InitThread;
  1034. finalization
  1035. { fini conversion tables for main program }
  1036. FiniThread;
  1037. { unload iconv library }
  1038. if iconvlib<>0 then
  1039. FreeLibrary(iconvlib);
  1040. { restore previous (probably default) widestring manager so that subsequent calls
  1041. into the widestring manager won't trigger the finalized functionality }
  1042. SetWideStringManager(OrgWideStringManager);
  1043. end.