cwstring.pp 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$if defined(haiku)}
  21. {$linklib textencoding}
  22. {$linklib locale}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc;
  34. Const
  35. {$ifndef useiconv}
  36. libiconvname='c'; // is in libc under Linux.
  37. {$else}
  38. {$ifdef haiku}
  39. libiconvname='textencoding'; // is in libtextencoding under Haiku
  40. {$else}
  41. libiconvname='iconv';
  42. {$endif}
  43. {$endif}
  44. { helper functions from libc }
  45. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  46. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  47. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  48. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  49. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  50. {$ifndef beos}
  51. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  52. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  53. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  54. {$else beos}
  55. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  56. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  57. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  58. {$endif beos}
  59. const
  60. {$ifdef linux}
  61. __LC_CTYPE = 0;
  62. LC_ALL = 6;
  63. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  64. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  65. CODESET = _NL_CTYPE_CODESET_NAME;
  66. {$else linux}
  67. {$ifdef darwin}
  68. CODESET = 0;
  69. LC_ALL = 0;
  70. {$else darwin}
  71. {$ifdef FreeBSD} // actually FreeBSD5. internationalisation is afaik not default on 4.
  72. __LC_CTYPE = 0;
  73. LC_ALL = 0;
  74. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  75. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  76. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  77. {$else freebsd}
  78. {$ifdef solaris}
  79. CODESET=49;
  80. LC_ALL = 6;
  81. {$else solaris}
  82. {$ifdef beos}
  83. {$warning check correct value for BeOS}
  84. CODESET=49;
  85. {$ifdef haiku}
  86. LC_ALL = 0; // Checked for Haiku
  87. {$else}
  88. LC_ALL = 6; // Checked for BeOS
  89. {$endif}
  90. ESysEILSEQ = EILSEQ;
  91. {$else}
  92. {$ifdef OpenBSD}
  93. CODESET = 51;
  94. LC_ALL = 0;
  95. {$else not OpenBSD}
  96. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  97. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  98. {$endif OpenBSD}
  99. {$endif beos}
  100. {$endif solaris}
  101. {$endif FreeBSD}
  102. {$endif darwin}
  103. {$endif linux}
  104. { unicode encoding name }
  105. {$ifdef FPC_LITTLE_ENDIAN}
  106. unicode_encoding2 = 'UTF-16LE';
  107. unicode_encoding4 = 'UCS-4LE';
  108. {$else FPC_LITTLE_ENDIAN}
  109. unicode_encoding2 = 'UTF-16BE';
  110. unicode_encoding4 = 'UCS-4BE';
  111. {$endif FPC_LITTLE_ENDIAN}
  112. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  113. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  114. { that's a libc macro mapped to internal functions/variables }
  115. { and thus not a stable external API on systems where libc }
  116. { breaks backwards compatibility every now and then }
  117. MB_CUR_MAX = 10;
  118. { Requests for iconvctl }
  119. ICONV_TRIVIALP = 0; // int *argument
  120. ICONV_GET_TRANSLITERATE = 1; // int *argument
  121. ICONV_SET_TRANSLITERATE = 2; // const int *argument
  122. ICONV_GET_DISCARD_ILSEQ = 3; // int *argument
  123. ICONV_SET_DISCARD_ILSEQ = 4; // const int *argument
  124. ICONV_SET_HOOKS = 5; // const struct iconv_hooks *argument
  125. ICONV_SET_FALLBACKS = 6; // const struct iconv_fallbacks *argument
  126. type
  127. piconv_t = ^iconv_t;
  128. iconv_t = pointer;
  129. nl_item = cint;
  130. {$ifdef haiku}
  131. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  132. {$else}
  133. {$ifndef beos}
  134. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  135. {$endif}
  136. {$endif}
  137. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  138. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  139. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  140. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  141. function iconvctl(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;external libiconvname name 'iconvctl';
  142. {$else}
  143. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  144. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  145. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  146. function iconvctl(__cd:iconv_t; __request:cint; __argument:pointer):cint;cdecl;external libiconvname name 'libiconvctl';
  147. {$endif}
  148. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  149. threadvar
  150. iconv_ansi2wide,
  151. iconv_wide2ansi : iconv_t;
  152. { since we cache the iconv_t converters, we have to do the same
  153. for the DefaultSystemCodePage variable since if it changes, we
  154. have to re-initialize the converters too. We can't do that via
  155. a callback in the widestring manager because DefaultSystemCodePage
  156. is not a threadvar and we can't automatically change this in all
  157. threads }
  158. current_DefaultSystemCodePage: TSystemCodePage;
  159. function win2iconv(cp: word): rawbytestring; forward;
  160. procedure InitThread;
  161. var
  162. transliterate: cint;
  163. {$if not(defined(darwin) and defined(cpuarm))}
  164. iconvname: rawbytestring;
  165. {$endif}
  166. begin
  167. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  168. {$if not(defined(darwin) and defined(cpuarm))}
  169. iconvname:=win2iconv(DefaultSystemCodePage);
  170. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  171. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  172. {$else}
  173. { Unix locale settings are ignored on iPhoneOS }
  174. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  175. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  176. {$endif}
  177. transliterate:=1;
  178. iconvctl(iconv_wide2ansi,ICONV_SET_TRANSLITERATE,@transliterate);
  179. end;
  180. procedure FiniThread;
  181. begin
  182. if (iconv_wide2ansi <> iconv_t(-1)) then
  183. iconv_close(iconv_wide2ansi);
  184. if (iconv_ansi2wide <> iconv_t(-1)) then
  185. iconv_close(iconv_ansi2wide);
  186. end;
  187. {$i winiconv}
  188. {$if defined(beos) and not defined(haiku)}
  189. function nl_langinfo(__item:nl_item):pchar;
  190. begin
  191. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  192. // Now implement the minimum required to correctly initialize WideString support
  193. case __item of
  194. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  195. else
  196. begin
  197. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  198. Result := '';
  199. end;
  200. end;
  201. end;
  202. {$endif}
  203. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  204. var
  205. outlength,
  206. outoffset,
  207. srclen,
  208. outleft : size_t;
  209. use_iconv: iconv_t;
  210. srcpos : pwidechar;
  211. destpos: pchar;
  212. mynil : pchar;
  213. my0 : size_t;
  214. err,
  215. transliterate: cint;
  216. free_iconv: boolean;
  217. begin
  218. if (cp=DefaultSystemCodePage) then
  219. begin
  220. { update iconv converter in case the DefaultSystemCodePage has been
  221. changed }
  222. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  223. begin
  224. FiniThread;
  225. InitThread;
  226. end;
  227. use_iconv:=iconv_wide2ansi;
  228. free_iconv:=false;
  229. end
  230. else
  231. begin
  232. { TODO: add caching (then we also don't need separate code for
  233. the default system page and other ones)
  234. -- typecasting an ansistring function result to pchar is
  235. unsafe normally, but these are constant strings -> no
  236. problem }
  237. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  238. transliterate:=1;
  239. iconvctl(use_iconv,ICONV_SET_TRANSLITERATE,@transliterate);
  240. free_iconv:=true;
  241. end;
  242. { unsupported encoding -> default move }
  243. if use_iconv=iconv_t(-1) then
  244. begin
  245. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  246. exit;
  247. end;
  248. mynil:=nil;
  249. my0:=0;
  250. { rought estimation }
  251. setlength(dest,len*3);
  252. SetCodePage(dest,cp,false);
  253. outlength:=len*3;
  254. srclen:=len*2;
  255. srcpos:=source;
  256. destpos:=pchar(dest);
  257. outleft:=outlength;
  258. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  259. begin
  260. err:=fpgetCerrno;
  261. case err of
  262. { last character is incomplete sequence }
  263. ESysEINVAL,
  264. { incomplete sequence in the middle }
  265. ESysEILSEQ:
  266. begin
  267. { skip and set to '?' }
  268. inc(srcpos);
  269. dec(srclen,2);
  270. destpos^:='?';
  271. inc(destpos);
  272. dec(outleft);
  273. { reset }
  274. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  275. if err=ESysEINVAL then
  276. break;
  277. end;
  278. ESysE2BIG:
  279. begin
  280. outoffset:=destpos-pchar(dest);
  281. { extend }
  282. setlength(dest,outlength+len*3);
  283. inc(outleft,len*3);
  284. inc(outlength,len*3);
  285. { string could have been moved }
  286. destpos:=pchar(dest)+outoffset;
  287. end;
  288. else
  289. runerror(231);
  290. end;
  291. end;
  292. // truncate string
  293. setlength(dest,length(dest)-outleft);
  294. if free_iconv then
  295. iconv_close(use_iconv);
  296. end;
  297. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  298. var
  299. outlength,
  300. outoffset,
  301. outleft : size_t;
  302. use_iconv: iconv_t;
  303. srcpos,
  304. destpos: pchar;
  305. mynil : pchar;
  306. my0 : size_t;
  307. err: cint;
  308. free_iconv: boolean;
  309. begin
  310. if (cp=DefaultSystemCodePage) then
  311. begin
  312. { update iconv converter in case the DefaultSystemCodePage has been
  313. changed }
  314. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  315. begin
  316. FiniThread;
  317. InitThread;
  318. end;
  319. use_iconv:=iconv_ansi2wide;
  320. free_iconv:=false;
  321. end
  322. else
  323. begin
  324. { TODO: add caching (then we also don't need separate code for
  325. the default system page and other ones)
  326. -- typecasting an ansistring function result to pchar is
  327. unsafe normally, but these are constant strings -> no
  328. problem }
  329. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  330. free_iconv:=true;
  331. end;
  332. { unsupported encoding -> default move }
  333. if use_iconv=iconv_t(-1) then
  334. begin
  335. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  336. exit;
  337. end;
  338. mynil:=nil;
  339. my0:=0;
  340. // extra space
  341. outlength:=len+1;
  342. setlength(dest,outlength);
  343. srcpos:=source;
  344. destpos:=pchar(dest);
  345. outleft:=outlength*2;
  346. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  347. begin
  348. err:=fpgetCerrno;
  349. case err of
  350. ESysEINVAL,
  351. ESysEILSEQ:
  352. begin
  353. { skip and set to '?' }
  354. inc(srcpos);
  355. dec(len);
  356. pwidechar(destpos)^:='?';
  357. inc(destpos,2);
  358. dec(outleft,2);
  359. { reset }
  360. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  361. if err=ESysEINVAL then
  362. break;
  363. end;
  364. ESysE2BIG:
  365. begin
  366. outoffset:=destpos-pchar(dest);
  367. { extend }
  368. setlength(dest,outlength+len);
  369. inc(outleft,len*2);
  370. inc(outlength,len);
  371. { string could have been moved }
  372. destpos:=pchar(dest)+outoffset;
  373. end;
  374. else
  375. runerror(231);
  376. end;
  377. end;
  378. // truncate string
  379. setlength(dest,length(dest)-outleft div 2);
  380. if free_iconv then
  381. iconv_close(use_iconv);
  382. end;
  383. function LowerWideString(const s : WideString) : WideString;
  384. var
  385. i : SizeInt;
  386. begin
  387. SetLength(result,length(s));
  388. for i:=0 to length(s)-1 do
  389. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  390. end;
  391. function UpperWideString(const s : WideString) : WideString;
  392. var
  393. i : SizeInt;
  394. begin
  395. SetLength(result,length(s));
  396. for i:=0 to length(s)-1 do
  397. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  398. end;
  399. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  400. begin
  401. if (len>length(s)) then
  402. if (length(s) < 10*256) then
  403. setlength(s,length(s)+10)
  404. else
  405. setlength(s,length(s)+length(s) shr 8);
  406. end;
  407. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  408. begin
  409. EnsureAnsiLen(s,index);
  410. pchar(@s[index])^:=c;
  411. inc(index);
  412. end;
  413. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  414. {$ifndef beos}
  415. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  416. {$else not beos}
  417. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  418. {$endif beos}
  419. var
  420. p : pchar;
  421. mblen : size_t;
  422. begin
  423. { we know that s is unique -> avoid uniquestring calls}
  424. p:=@s[index];
  425. if (nc<=127) then
  426. ConcatCharToAnsiStr(char(nc),s,index)
  427. else
  428. begin
  429. EnsureAnsiLen(s,index+MB_CUR_MAX);
  430. {$ifndef beos}
  431. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  432. {$else not beos}
  433. mblen:=wctomb(p,wchar_t(nc));
  434. {$endif not beos}
  435. if (mblen<>size_t(-1)) then
  436. inc(index,mblen)
  437. else
  438. begin
  439. { invalid wide char }
  440. p^:='?';
  441. inc(index);
  442. end;
  443. end;
  444. end;
  445. function LowerAnsiString(const s : AnsiString) : AnsiString;
  446. var
  447. i, slen,
  448. resindex : SizeInt;
  449. mblen : size_t;
  450. {$ifndef beos}
  451. ombstate,
  452. nmbstate : mbstate_t;
  453. {$endif beos}
  454. wc : wchar_t;
  455. begin
  456. {$ifndef beos}
  457. fillchar(ombstate,sizeof(ombstate),0);
  458. fillchar(nmbstate,sizeof(nmbstate),0);
  459. {$endif beos}
  460. slen:=length(s);
  461. SetLength(result,slen+10);
  462. i:=1;
  463. resindex:=1;
  464. while (i<=slen) do
  465. begin
  466. if (s[i]<=#127) then
  467. begin
  468. wc:=wchar_t(s[i]);
  469. mblen:= 1;
  470. end
  471. else
  472. {$ifndef beos}
  473. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  474. {$else not beos}
  475. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  476. {$endif not beos}
  477. case mblen of
  478. size_t(-2):
  479. begin
  480. { partial invalid character, copy literally }
  481. while (i<=slen) do
  482. begin
  483. ConcatCharToAnsiStr(s[i],result,resindex);
  484. inc(i);
  485. end;
  486. end;
  487. size_t(-1), 0:
  488. begin
  489. { invalid or null character }
  490. ConcatCharToAnsiStr(s[i],result,resindex);
  491. inc(i);
  492. end;
  493. else
  494. begin
  495. { a valid sequence }
  496. { even if mblen = 1, the lowercase version may have a }
  497. { different length }
  498. { We can't do anything special if wchar_t is 16 bit... }
  499. {$ifndef beos}
  500. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  501. {$else not beos}
  502. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  503. {$endif not beos}
  504. inc(i,mblen);
  505. end;
  506. end;
  507. end;
  508. SetLength(result,resindex-1);
  509. end;
  510. function UpperAnsiString(const s : AnsiString) : AnsiString;
  511. var
  512. i, slen,
  513. resindex : SizeInt;
  514. mblen : size_t;
  515. {$ifndef beos}
  516. ombstate,
  517. nmbstate : mbstate_t;
  518. {$endif beos}
  519. wc : wchar_t;
  520. begin
  521. {$ifndef beos}
  522. fillchar(ombstate,sizeof(ombstate),0);
  523. fillchar(nmbstate,sizeof(nmbstate),0);
  524. {$endif beos}
  525. slen:=length(s);
  526. SetLength(result,slen+10);
  527. i:=1;
  528. resindex:=1;
  529. while (i<=slen) do
  530. begin
  531. if (s[i]<=#127) then
  532. begin
  533. wc:=wchar_t(s[i]);
  534. mblen:= 1;
  535. end
  536. else
  537. {$ifndef beos}
  538. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  539. {$else not beos}
  540. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  541. {$endif beos}
  542. case mblen of
  543. size_t(-2):
  544. begin
  545. { partial invalid character, copy literally }
  546. while (i<=slen) do
  547. begin
  548. ConcatCharToAnsiStr(s[i],result,resindex);
  549. inc(i);
  550. end;
  551. end;
  552. size_t(-1), 0:
  553. begin
  554. { invalid or null character }
  555. ConcatCharToAnsiStr(s[i],result,resindex);
  556. inc(i);
  557. end;
  558. else
  559. begin
  560. { a valid sequence }
  561. { even if mblen = 1, the uppercase version may have a }
  562. { different length }
  563. { We can't do anything special if wchar_t is 16 bit... }
  564. {$ifndef beos}
  565. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  566. {$else not beos}
  567. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  568. {$endif not beos}
  569. inc(i,mblen);
  570. end;
  571. end;
  572. end;
  573. SetLength(result,resindex-1);
  574. end;
  575. function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; external name 'FPC_UTF16TOUTF32';
  576. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  577. var
  578. i, slen,
  579. destindex : SizeInt;
  580. len : longint;
  581. uch : UCS4Char;
  582. begin
  583. slen:=length(s);
  584. setlength(result,slen+1);
  585. i:=1;
  586. destindex:=0;
  587. while (i<=slen) do
  588. begin
  589. uch:=utf16toutf32(s,i,len);
  590. if (uch=UCS4Char(0)) then
  591. uch:=UCS4Char(32);
  592. result[destindex]:=uch;
  593. inc(destindex);
  594. inc(i,len);
  595. end;
  596. result[destindex]:=UCS4Char(0);
  597. { destindex <= slen }
  598. setlength(result,destindex+1);
  599. end;
  600. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  601. var
  602. hs1,hs2 : UCS4String;
  603. begin
  604. { wcscoll interprets null chars as end-of-string -> filter out }
  605. hs1:=WideStringToUCS4StringNoNulls(s1);
  606. hs2:=WideStringToUCS4StringNoNulls(s2);
  607. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  608. end;
  609. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  610. begin
  611. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  612. end;
  613. function CharLengthPChar(const Str: PChar): PtrInt;
  614. var
  615. nextlen: ptrint;
  616. s: pchar;
  617. {$ifndef beos}
  618. mbstate: mbstate_t;
  619. {$endif not beos}
  620. begin
  621. result:=0;
  622. s:=str;
  623. {$ifndef beos}
  624. fillchar(mbstate,sizeof(mbstate),0);
  625. {$endif not beos}
  626. repeat
  627. {$ifdef beos}
  628. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  629. {$else beos}
  630. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  631. {$endif beos}
  632. { skip invalid/incomplete sequences }
  633. if (nextlen<0) then
  634. nextlen:=1;
  635. inc(result,nextlen);
  636. inc(s,nextlen);
  637. until (nextlen=0);
  638. end;
  639. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  640. var
  641. nextlen: ptrint;
  642. {$ifndef beos}
  643. mbstate: mbstate_t;
  644. {$endif not beos}
  645. begin
  646. {$ifdef beos}
  647. result:=ptrint(mblen(str,maxlookahead));
  648. {$else beos}
  649. fillchar(mbstate,sizeof(mbstate),0);
  650. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  651. { mbrlen can also return -2 for "incomplete but potially valid character
  652. and data has been processed" }
  653. if result<0 then
  654. result:=-1;
  655. {$endif beos}
  656. end;
  657. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  658. var
  659. a,b: pchar;
  660. i: PtrInt;
  661. begin
  662. if not(canmodifys1) then
  663. getmem(a,len1+1)
  664. else
  665. a:=s1;
  666. for i:=0 to len1-1 do
  667. if s1[i]<>#0 then
  668. a[i]:=s1[i]
  669. else
  670. a[i]:=#32;
  671. a[len1]:=#0;
  672. if not(canmodifys2) then
  673. getmem(b,len2+1)
  674. else
  675. b:=s2;
  676. for i:=0 to len2-1 do
  677. if s2[i]<>#0 then
  678. b[i]:=s2[i]
  679. else
  680. b[i]:=#32;
  681. b[len2]:=#0;
  682. result:=strcoll(a,b);
  683. if not(canmodifys1) then
  684. freemem(a);
  685. if not(canmodifys2) then
  686. freemem(b);
  687. end;
  688. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  689. begin
  690. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  691. end;
  692. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  693. begin
  694. result:=strcoll(s1,s2);
  695. end;
  696. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  697. var
  698. a, b: AnsiString;
  699. begin
  700. a:=UpperAnsistring(s1);
  701. b:=UpperAnsistring(s2);
  702. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  703. end;
  704. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  705. begin
  706. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  707. end;
  708. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  709. var
  710. a, b: pchar;
  711. begin
  712. if (maxlen=0) then
  713. exit(0);
  714. if (s1[maxlen]<>#0) then
  715. begin
  716. getmem(a,maxlen+1);
  717. move(s1^,a^,maxlen);
  718. a[maxlen]:=#0;
  719. end
  720. else
  721. a:=s1;
  722. if (s2[maxlen]<>#0) then
  723. begin
  724. getmem(b,maxlen+1);
  725. move(s2^,b^,maxlen);
  726. b[maxlen]:=#0;
  727. end
  728. else
  729. b:=s2;
  730. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  731. if (a<>s1) then
  732. freemem(a);
  733. if (b<>s2) then
  734. freemem(b);
  735. end;
  736. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  737. var
  738. a, b: ansistring;
  739. begin
  740. if (maxlen=0) then
  741. exit(0);
  742. setlength(a,maxlen);
  743. move(s1^,a[1],maxlen);
  744. setlength(b,maxlen);
  745. move(s2^,b[1],maxlen);
  746. result:=AnsiCompareText(a,b);
  747. end;
  748. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  749. var
  750. newlen: sizeint;
  751. begin
  752. newlen:=length(s);
  753. if newlen>strlen(orgp) then
  754. fpc_rangeerror;
  755. p:=orgp;
  756. if (newlen>0) then
  757. move(s[1],p[0],newlen);
  758. p[newlen]:=#0;
  759. end;
  760. function AnsiStrLower(Str: PChar): PChar;
  761. var
  762. temp: ansistring;
  763. begin
  764. temp:=loweransistring(str);
  765. ansi2pchar(temp,str,result);
  766. end;
  767. function AnsiStrUpper(Str: PChar): PChar;
  768. var
  769. temp: ansistring;
  770. begin
  771. temp:=upperansistring(str);
  772. ansi2pchar(temp,str,result);
  773. end;
  774. Procedure SetCWideStringManager;
  775. Var
  776. CWideStringManager : TUnicodeStringManager;
  777. begin
  778. CWideStringManager:=widestringmanager;
  779. With CWideStringManager do
  780. begin
  781. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  782. Ansi2WideMoveProc:=@Ansi2WideMove;
  783. UpperWideStringProc:=@UpperWideString;
  784. LowerWideStringProc:=@LowerWideString;
  785. CompareWideStringProc:=@CompareWideString;
  786. CompareTextWideStringProc:=@CompareTextWideString;
  787. CharLengthPCharProc:=@CharLengthPChar;
  788. CodePointLengthProc:=@CodePointLength;
  789. UpperAnsiStringProc:=@UpperAnsiString;
  790. LowerAnsiStringProc:=@LowerAnsiString;
  791. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  792. CompareTextAnsiStringProc:=@AnsiCompareText;
  793. StrCompAnsiStringProc:=@StrCompAnsi;
  794. StrICompAnsiStringProc:=@AnsiStrIComp;
  795. StrLCompAnsiStringProc:=@AnsiStrLComp;
  796. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  797. StrLowerAnsiStringProc:=@AnsiStrLower;
  798. StrUpperAnsiStringProc:=@AnsiStrUpper;
  799. ThreadInitProc:=@InitThread;
  800. ThreadFiniProc:=@FiniThread;
  801. { Unicode }
  802. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  803. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  804. UpperUnicodeStringProc:=@UpperWideString;
  805. LowerUnicodeStringProc:=@LowerWideString;
  806. CompareUnicodeStringProc:=@CompareWideString;
  807. CompareTextUnicodeStringProc:=@CompareTextWideString;
  808. end;
  809. SetUnicodeStringManager(CWideStringManager);
  810. end;
  811. initialization
  812. SetCWideStringManager;
  813. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  814. { with the information from the environment variables according to POSIX }
  815. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  816. setlocale(LC_ALL,'');
  817. { set the DefaultSystemCodePage }
  818. DefaultSystemCodePage:=iconv2win(ansistring(nl_langinfo(CODESET)));
  819. { init conversion tables for main program }
  820. InitThread;
  821. finalization
  822. { fini conversion tables for main program }
  823. FiniThread;
  824. end.