cwstring.pp 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$if defined(haiku)}
  21. {$linklib textencoding}
  22. {$linklib locale}
  23. {$else}
  24. {$linklib iconv}
  25. {$endif}
  26. {$define useiconv}
  27. {$endif linux}
  28. Uses
  29. BaseUnix,
  30. ctypes,
  31. unix,
  32. unixtype,
  33. initc;
  34. Const
  35. {$ifndef useiconv}
  36. libiconvname='c'; // is in libc under Linux.
  37. {$else}
  38. {$ifdef haiku}
  39. libiconvname='textencoding'; // is in libtextencoding under Haiku
  40. {$else}
  41. libiconvname='iconv';
  42. {$endif}
  43. {$endif}
  44. { helper functions from libc }
  45. function towlower(__wc:wint_t):wint_t;cdecl;external clib name 'towlower';
  46. function towupper(__wc:wint_t):wint_t;cdecl;external clib name 'towupper';
  47. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external clib name 'wcscoll';
  48. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external clib name 'strcoll';
  49. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  50. {$ifndef beos}
  51. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  52. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  53. function mbrlen(const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrlen';
  54. {$else beos}
  55. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  56. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  57. function mblen(const s: pchar; n: size_t): size_t; cdecl; external clib name 'mblen';
  58. {$endif beos}
  59. const
  60. {$ifdef linux}
  61. __LC_CTYPE = 0;
  62. LC_ALL = 6;
  63. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  64. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  65. CODESET = _NL_CTYPE_CODESET_NAME;
  66. {$else linux}
  67. {$ifdef darwin}
  68. CODESET = 0;
  69. LC_ALL = 0;
  70. {$else darwin}
  71. {$ifdef FreeBSD} // actually FreeBSD5. internationalisation is afaik not default on 4.
  72. __LC_CTYPE = 0;
  73. LC_ALL = 0;
  74. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  75. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  76. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  77. {$else freebsd}
  78. {$ifdef solaris}
  79. CODESET=49;
  80. LC_ALL = 6;
  81. {$else solaris}
  82. {$ifdef beos}
  83. {$warning check correct value for BeOS}
  84. CODESET=49;
  85. {$ifdef haiku}
  86. LC_ALL = 0; // Checked for Haiku
  87. {$else}
  88. LC_ALL = 6; // Checked for BeOS
  89. {$endif}
  90. ESysEILSEQ = EILSEQ;
  91. {$else}
  92. {$ifdef OpenBSD}
  93. CODESET = 51;
  94. LC_ALL = 0;
  95. {$else not OpenBSD}
  96. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  97. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  98. {$endif OpenBSD}
  99. {$endif beos}
  100. {$endif solaris}
  101. {$endif FreeBSD}
  102. {$endif darwin}
  103. {$endif linux}
  104. { unicode encoding name }
  105. {$ifdef FPC_LITTLE_ENDIAN}
  106. unicode_encoding2 = 'UTF-16LE';
  107. unicode_encoding4 = 'UCS-4LE';
  108. {$else FPC_LITTLE_ENDIAN}
  109. unicode_encoding2 = 'UTF-16BE';
  110. unicode_encoding4 = 'UCS-4BE';
  111. {$endif FPC_LITTLE_ENDIAN}
  112. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  113. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  114. { that's a libc macro mapped to internal functions/variables }
  115. { and thus not a stable external API on systems where libc }
  116. { breaks backwards compatibility every now and then }
  117. MB_CUR_MAX = 10;
  118. type
  119. piconv_t = ^iconv_t;
  120. iconv_t = pointer;
  121. nl_item = cint;
  122. {$ifdef haiku}
  123. function nl_langinfo(__item:nl_item):pchar;cdecl;external 'locale' name 'nl_langinfo';
  124. {$else}
  125. {$ifndef beos}
  126. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  127. {$endif}
  128. {$endif}
  129. {$if (not defined(bsd) and not defined(beos)) or (defined(darwin) and not defined(cpupowerpc32))}
  130. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  131. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  132. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  133. {$else}
  134. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  135. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  136. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  137. {$endif}
  138. procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
  139. threadvar
  140. iconv_ansi2wide,
  141. iconv_wide2ansi : iconv_t;
  142. { since we cache the iconv_t converters, we have to do the same
  143. for the DefaultSystemCodePage variable since if it changes, we
  144. have to re-initialize the converters too. We can't do that via
  145. a callback in the widestring manager because DefaultSystemCodePage
  146. is not a threadvar and we can't automatically change this in all
  147. threads }
  148. current_DefaultSystemCodePage: TSystemCodePage;
  149. function win2iconv(cp: word): rawbytestring; forward;
  150. procedure InitThread;
  151. {$if not(defined(darwin) and defined(arm))}
  152. var
  153. iconvname: rawbytestring;
  154. {$endif}
  155. begin
  156. current_DefaultSystemCodePage:=DefaultSystemCodePage;
  157. {$if not(defined(darwin) and defined(arm))}
  158. iconvname:=win2iconv(DefaultSystemCodePage);
  159. iconv_wide2ansi:=iconv_open(pchar(iconvname),unicode_encoding2);
  160. iconv_ansi2wide:=iconv_open(unicode_encoding2,pchar(iconvname));
  161. {$else}
  162. { Unix locale settings are ignored on iPhoneOS }
  163. iconv_wide2ansi:=iconv_open('UTF-8',unicode_encoding2);
  164. iconv_ansi2wide:=iconv_open(unicode_encoding2,'UTF-8');
  165. {$endif}
  166. end;
  167. procedure FiniThread;
  168. begin
  169. if (iconv_wide2ansi <> iconv_t(-1)) then
  170. iconv_close(iconv_wide2ansi);
  171. if (iconv_ansi2wide <> iconv_t(-1)) then
  172. iconv_close(iconv_ansi2wide);
  173. end;
  174. {$i winiconv}
  175. {$if defined(beos) and not defined(haiku)}
  176. function nl_langinfo(__item:nl_item):pchar;
  177. begin
  178. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  179. // Now implement the minimum required to correctly initialize WideString support
  180. case __item of
  181. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  182. else
  183. begin
  184. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  185. Result := '';
  186. end;
  187. end;
  188. end;
  189. {$endif}
  190. procedure Wide2AnsiMove(source:pwidechar; var dest:RawByteString; cp:TSystemCodePage; len:SizeInt);
  191. var
  192. outlength,
  193. outoffset,
  194. srclen,
  195. outleft : size_t;
  196. use_iconv: iconv_t;
  197. srcpos : pwidechar;
  198. destpos: pchar;
  199. mynil : pchar;
  200. my0 : size_t;
  201. err: cint;
  202. free_iconv: boolean;
  203. begin
  204. if (cp=DefaultSystemCodePage) then
  205. begin
  206. { update iconv converter in case the DefaultSystemCodePage has been
  207. changed }
  208. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  209. begin
  210. FiniThread;
  211. InitThread;
  212. end;
  213. use_iconv:=iconv_wide2ansi;
  214. free_iconv:=false;
  215. end
  216. else
  217. begin
  218. { TODO: add caching (then we also don't need separate code for
  219. the default system page and other ones)
  220. -- typecasting an ansistring function result to pchar is
  221. unsafe normally, but these are constant strings -> no
  222. problem }
  223. use_iconv:=iconv_open(pchar(win2iconv(cp)),unicode_encoding2);
  224. free_iconv:=true;
  225. end;
  226. { unsupported encoding -> default move }
  227. if use_iconv=iconv_t(-1) then
  228. begin
  229. DefaultUnicode2AnsiMove(source,dest,DefaultSystemCodePage,len);
  230. exit;
  231. end;
  232. mynil:=nil;
  233. my0:=0;
  234. { rought estimation }
  235. setlength(dest,len*3);
  236. SetCodePage(dest,cp,false);
  237. outlength:=len*3;
  238. srclen:=len*2;
  239. srcpos:=source;
  240. destpos:=pchar(dest);
  241. outleft:=outlength;
  242. while iconv(use_iconv,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  243. begin
  244. err:=fpgetCerrno;
  245. case err of
  246. { last character is incomplete sequence }
  247. ESysEINVAL,
  248. { incomplete sequence in the middle }
  249. ESysEILSEQ:
  250. begin
  251. { skip and set to '?' }
  252. inc(srcpos);
  253. dec(srclen,2);
  254. destpos^:='?';
  255. inc(destpos);
  256. dec(outleft);
  257. { reset }
  258. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  259. if err=ESysEINVAL then
  260. break;
  261. end;
  262. ESysE2BIG:
  263. begin
  264. outoffset:=destpos-pchar(dest);
  265. { extend }
  266. setlength(dest,outlength+len*3);
  267. inc(outleft,len*3);
  268. inc(outlength,len*3);
  269. { string could have been moved }
  270. destpos:=pchar(dest)+outoffset;
  271. end;
  272. else
  273. runerror(231);
  274. end;
  275. end;
  276. // truncate string
  277. setlength(dest,length(dest)-outleft);
  278. if free_iconv then
  279. iconv_close(use_iconv);
  280. end;
  281. procedure Ansi2WideMove(source:pchar; cp:TSystemCodePage; var dest:widestring; len:SizeInt);
  282. var
  283. outlength,
  284. outoffset,
  285. outleft : size_t;
  286. use_iconv: iconv_t;
  287. srcpos,
  288. destpos: pchar;
  289. mynil : pchar;
  290. my0 : size_t;
  291. err: cint;
  292. free_iconv: boolean;
  293. begin
  294. if (cp=DefaultSystemCodePage) then
  295. begin
  296. { update iconv converter in case the DefaultSystemCodePage has been
  297. changed }
  298. if current_DefaultSystemCodePage<>DefaultSystemCodePage then
  299. begin
  300. FiniThread;
  301. InitThread;
  302. end;
  303. use_iconv:=iconv_ansi2wide;
  304. free_iconv:=false;
  305. end
  306. else
  307. begin
  308. { TODO: add caching (then we also don't need separate code for
  309. the default system page and other ones)
  310. -- typecasting an ansistring function result to pchar is
  311. unsafe normally, but these are constant strings -> no
  312. problem }
  313. use_iconv:=iconv_open(unicode_encoding2,pchar(win2iconv(cp)));
  314. free_iconv:=true;
  315. end;
  316. { unsupported encoding -> default move }
  317. if use_iconv=iconv_t(-1) then
  318. begin
  319. DefaultAnsi2UnicodeMove(source,DefaultSystemCodePage,dest,len);
  320. exit;
  321. end;
  322. mynil:=nil;
  323. my0:=0;
  324. // extra space
  325. outlength:=len+1;
  326. setlength(dest,outlength);
  327. srcpos:=source;
  328. destpos:=pchar(dest);
  329. outleft:=outlength*2;
  330. while iconv(use_iconv,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  331. begin
  332. err:=fpgetCerrno;
  333. case err of
  334. ESysEINVAL,
  335. ESysEILSEQ:
  336. begin
  337. { skip and set to '?' }
  338. inc(srcpos);
  339. dec(len);
  340. pwidechar(destpos)^:='?';
  341. inc(destpos,2);
  342. dec(outleft,2);
  343. { reset }
  344. iconv(use_iconv,@mynil,@my0,@mynil,@my0);
  345. if err=ESysEINVAL then
  346. break;
  347. end;
  348. ESysE2BIG:
  349. begin
  350. outoffset:=destpos-pchar(dest);
  351. { extend }
  352. setlength(dest,outlength+len);
  353. inc(outleft,len*2);
  354. inc(outlength,len);
  355. { string could have been moved }
  356. destpos:=pchar(dest)+outoffset;
  357. end;
  358. else
  359. runerror(231);
  360. end;
  361. end;
  362. // truncate string
  363. setlength(dest,length(dest)-outleft div 2);
  364. if free_iconv then
  365. iconv_close(use_iconv);
  366. end;
  367. function LowerWideString(const s : WideString) : WideString;
  368. var
  369. i : SizeInt;
  370. begin
  371. SetLength(result,length(s));
  372. for i:=0 to length(s)-1 do
  373. pwidechar(result)[i]:=WideChar(towlower(wint_t(s[i+1])));
  374. end;
  375. function UpperWideString(const s : WideString) : WideString;
  376. var
  377. i : SizeInt;
  378. begin
  379. SetLength(result,length(s));
  380. for i:=0 to length(s)-1 do
  381. pwidechar(result)[i]:=WideChar(towupper(wint_t(s[i+1])));
  382. end;
  383. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  384. begin
  385. if (len>length(s)) then
  386. if (length(s) < 10*256) then
  387. setlength(s,length(s)+10)
  388. else
  389. setlength(s,length(s)+length(s) shr 8);
  390. end;
  391. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  392. begin
  393. EnsureAnsiLen(s,index);
  394. pchar(@s[index])^:=c;
  395. inc(index);
  396. end;
  397. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  398. {$ifndef beos}
  399. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  400. {$else not beos}
  401. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  402. {$endif beos}
  403. var
  404. p : pchar;
  405. mblen : size_t;
  406. begin
  407. { we know that s is unique -> avoid uniquestring calls}
  408. p:=@s[index];
  409. if (nc<=127) then
  410. ConcatCharToAnsiStr(char(nc),s,index)
  411. else
  412. begin
  413. EnsureAnsiLen(s,index+MB_CUR_MAX);
  414. {$ifndef beos}
  415. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  416. {$else not beos}
  417. mblen:=wctomb(p,wchar_t(nc));
  418. {$endif not beos}
  419. if (mblen<>size_t(-1)) then
  420. inc(index,mblen)
  421. else
  422. begin
  423. { invalid wide char }
  424. p^:='?';
  425. inc(index);
  426. end;
  427. end;
  428. end;
  429. function LowerAnsiString(const s : AnsiString) : AnsiString;
  430. var
  431. i, slen,
  432. resindex : SizeInt;
  433. mblen : size_t;
  434. {$ifndef beos}
  435. ombstate,
  436. nmbstate : mbstate_t;
  437. {$endif beos}
  438. wc : wchar_t;
  439. begin
  440. {$ifndef beos}
  441. fillchar(ombstate,sizeof(ombstate),0);
  442. fillchar(nmbstate,sizeof(nmbstate),0);
  443. {$endif beos}
  444. slen:=length(s);
  445. SetLength(result,slen+10);
  446. i:=1;
  447. resindex:=1;
  448. while (i<=slen) do
  449. begin
  450. if (s[i]<=#127) then
  451. begin
  452. wc:=wchar_t(s[i]);
  453. mblen:= 1;
  454. end
  455. else
  456. {$ifndef beos}
  457. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  458. {$else not beos}
  459. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  460. {$endif not beos}
  461. case mblen of
  462. size_t(-2):
  463. begin
  464. { partial invalid character, copy literally }
  465. while (i<=slen) do
  466. begin
  467. ConcatCharToAnsiStr(s[i],result,resindex);
  468. inc(i);
  469. end;
  470. end;
  471. size_t(-1), 0:
  472. begin
  473. { invalid or null character }
  474. ConcatCharToAnsiStr(s[i],result,resindex);
  475. inc(i);
  476. end;
  477. else
  478. begin
  479. { a valid sequence }
  480. { even if mblen = 1, the lowercase version may have a }
  481. { different length }
  482. { We can't do anything special if wchar_t is 16 bit... }
  483. {$ifndef beos}
  484. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  485. {$else not beos}
  486. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  487. {$endif not beos}
  488. inc(i,mblen);
  489. end;
  490. end;
  491. end;
  492. SetLength(result,resindex-1);
  493. end;
  494. function UpperAnsiString(const s : AnsiString) : AnsiString;
  495. var
  496. i, slen,
  497. resindex : SizeInt;
  498. mblen : size_t;
  499. {$ifndef beos}
  500. ombstate,
  501. nmbstate : mbstate_t;
  502. {$endif beos}
  503. wc : wchar_t;
  504. begin
  505. {$ifndef beos}
  506. fillchar(ombstate,sizeof(ombstate),0);
  507. fillchar(nmbstate,sizeof(nmbstate),0);
  508. {$endif beos}
  509. slen:=length(s);
  510. SetLength(result,slen+10);
  511. i:=1;
  512. resindex:=1;
  513. while (i<=slen) do
  514. begin
  515. if (s[i]<=#127) then
  516. begin
  517. wc:=wchar_t(s[i]);
  518. mblen:= 1;
  519. end
  520. else
  521. {$ifndef beos}
  522. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  523. {$else not beos}
  524. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  525. {$endif beos}
  526. case mblen of
  527. size_t(-2):
  528. begin
  529. { partial invalid character, copy literally }
  530. while (i<=slen) do
  531. begin
  532. ConcatCharToAnsiStr(s[i],result,resindex);
  533. inc(i);
  534. end;
  535. end;
  536. size_t(-1), 0:
  537. begin
  538. { invalid or null character }
  539. ConcatCharToAnsiStr(s[i],result,resindex);
  540. inc(i);
  541. end;
  542. else
  543. begin
  544. { a valid sequence }
  545. { even if mblen = 1, the uppercase version may have a }
  546. { different length }
  547. { We can't do anything special if wchar_t is 16 bit... }
  548. {$ifndef beos}
  549. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  550. {$else not beos}
  551. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  552. {$endif not beos}
  553. inc(i,mblen);
  554. end;
  555. end;
  556. end;
  557. SetLength(result,resindex-1);
  558. end;
  559. function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; external name 'FPC_UTF16TOUTF32';
  560. function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String;
  561. var
  562. i, slen,
  563. destindex : SizeInt;
  564. len : longint;
  565. uch : UCS4Char;
  566. begin
  567. slen:=length(s);
  568. setlength(result,slen+1);
  569. i:=1;
  570. destindex:=0;
  571. while (i<=slen) do
  572. begin
  573. uch:=utf16toutf32(s,i,len);
  574. if (uch=UCS4Char(0)) then
  575. uch:=UCS4Char(32);
  576. result[destindex]:=uch;
  577. inc(destindex);
  578. inc(i,len);
  579. end;
  580. result[destindex]:=UCS4Char(0);
  581. { destindex <= slen }
  582. setlength(result,destindex+1);
  583. end;
  584. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  585. var
  586. hs1,hs2 : UCS4String;
  587. begin
  588. { wcscoll interprets null chars as end-of-string -> filter out }
  589. hs1:=WideStringToUCS4StringNoNulls(s1);
  590. hs2:=WideStringToUCS4StringNoNulls(s2);
  591. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  592. end;
  593. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  594. begin
  595. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  596. end;
  597. function CharLengthPChar(const Str: PChar): PtrInt;
  598. var
  599. nextlen: ptrint;
  600. s: pchar;
  601. {$ifndef beos}
  602. mbstate: mbstate_t;
  603. {$endif not beos}
  604. begin
  605. result:=0;
  606. s:=str;
  607. {$ifndef beos}
  608. fillchar(mbstate,sizeof(mbstate),0);
  609. {$endif not beos}
  610. repeat
  611. {$ifdef beos}
  612. nextlen:=ptrint(mblen(str,MB_CUR_MAX));
  613. {$else beos}
  614. nextlen:=ptrint(mbrlen(str,MB_CUR_MAX,@mbstate));
  615. {$endif beos}
  616. { skip invalid/incomplete sequences }
  617. if (nextlen<0) then
  618. nextlen:=1;
  619. inc(result,nextlen);
  620. inc(s,nextlen);
  621. until (nextlen=0);
  622. end;
  623. function CodePointLength(const Str: PChar; maxlookahead: ptrint): PtrInt;
  624. var
  625. nextlen: ptrint;
  626. {$ifndef beos}
  627. mbstate: mbstate_t;
  628. {$endif not beos}
  629. begin
  630. {$ifdef beos}
  631. result:=ptrint(mblen(str,maxlookahead));
  632. {$else beos}
  633. fillchar(mbstate,sizeof(mbstate),0);
  634. result:=ptrint(mbrlen(str,maxlookahead,@mbstate));
  635. { mbrlen can also return -2 for "incomplete but potially valid character
  636. and data has been processed" }
  637. if result<0 then
  638. result:=-1;
  639. {$endif beos}
  640. end;
  641. function StrCompAnsiIntern(s1,s2 : PChar; len1, len2: PtrInt; canmodifys1, canmodifys2: boolean): PtrInt;
  642. var
  643. a,b: pchar;
  644. i: PtrInt;
  645. begin
  646. if not(canmodifys1) then
  647. getmem(a,len1+1)
  648. else
  649. a:=s1;
  650. for i:=0 to len1-1 do
  651. if s1[i]<>#0 then
  652. a[i]:=s1[i]
  653. else
  654. a[i]:=#32;
  655. a[len1]:=#0;
  656. if not(canmodifys2) then
  657. getmem(b,len2+1)
  658. else
  659. b:=s2;
  660. for i:=0 to len2-1 do
  661. if s2[i]<>#0 then
  662. b[i]:=s2[i]
  663. else
  664. b[i]:=#32;
  665. b[len2]:=#0;
  666. result:=strcoll(a,b);
  667. if not(canmodifys1) then
  668. freemem(a);
  669. if not(canmodifys2) then
  670. freemem(b);
  671. end;
  672. function CompareStrAnsiString(const s1, s2: ansistring): PtrInt;
  673. begin
  674. result:=StrCompAnsiIntern(pchar(s1),pchar(s2),length(s1),length(s2),false,false);
  675. end;
  676. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  677. begin
  678. result:=strcoll(s1,s2);
  679. end;
  680. function AnsiCompareText(const S1, S2: ansistring): PtrInt;
  681. var
  682. a, b: AnsiString;
  683. begin
  684. a:=UpperAnsistring(s1);
  685. b:=UpperAnsistring(s2);
  686. result:=StrCompAnsiIntern(pchar(a),pchar(b),length(a),length(b),true,true);
  687. end;
  688. function AnsiStrIComp(S1, S2: PChar): PtrInt;
  689. begin
  690. result:=AnsiCompareText(ansistring(s1),ansistring(s2));
  691. end;
  692. function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  693. var
  694. a, b: pchar;
  695. begin
  696. if (maxlen=0) then
  697. exit(0);
  698. if (s1[maxlen]<>#0) then
  699. begin
  700. getmem(a,maxlen+1);
  701. move(s1^,a^,maxlen);
  702. a[maxlen]:=#0;
  703. end
  704. else
  705. a:=s1;
  706. if (s2[maxlen]<>#0) then
  707. begin
  708. getmem(b,maxlen+1);
  709. move(s2^,b^,maxlen);
  710. b[maxlen]:=#0;
  711. end
  712. else
  713. b:=s2;
  714. result:=StrCompAnsiIntern(a,b,maxlen,maxlen,a<>s1,b<>s2);
  715. if (a<>s1) then
  716. freemem(a);
  717. if (b<>s2) then
  718. freemem(b);
  719. end;
  720. function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
  721. var
  722. a, b: ansistring;
  723. begin
  724. if (maxlen=0) then
  725. exit(0);
  726. setlength(a,maxlen);
  727. move(s1^,a[1],maxlen);
  728. setlength(b,maxlen);
  729. move(s2^,b[1],maxlen);
  730. result:=AnsiCompareText(a,b);
  731. end;
  732. procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
  733. var
  734. newlen: sizeint;
  735. begin
  736. newlen:=length(s);
  737. if newlen>strlen(orgp) then
  738. fpc_rangeerror;
  739. p:=orgp;
  740. if (newlen>0) then
  741. move(s[1],p[0],newlen);
  742. p[newlen]:=#0;
  743. end;
  744. function AnsiStrLower(Str: PChar): PChar;
  745. var
  746. temp: ansistring;
  747. begin
  748. temp:=loweransistring(str);
  749. ansi2pchar(temp,str,result);
  750. end;
  751. function AnsiStrUpper(Str: PChar): PChar;
  752. var
  753. temp: ansistring;
  754. begin
  755. temp:=upperansistring(str);
  756. ansi2pchar(temp,str,result);
  757. end;
  758. Procedure SetCWideStringManager;
  759. Var
  760. CWideStringManager : TUnicodeStringManager;
  761. begin
  762. CWideStringManager:=widestringmanager;
  763. With CWideStringManager do
  764. begin
  765. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  766. Ansi2WideMoveProc:=@Ansi2WideMove;
  767. UpperWideStringProc:=@UpperWideString;
  768. LowerWideStringProc:=@LowerWideString;
  769. CompareWideStringProc:=@CompareWideString;
  770. CompareTextWideStringProc:=@CompareTextWideString;
  771. CharLengthPCharProc:=@CharLengthPChar;
  772. CodePointLengthProc:=@CodePointLength;
  773. UpperAnsiStringProc:=@UpperAnsiString;
  774. LowerAnsiStringProc:=@LowerAnsiString;
  775. CompareStrAnsiStringProc:=@CompareStrAnsiString;
  776. CompareTextAnsiStringProc:=@AnsiCompareText;
  777. StrCompAnsiStringProc:=@StrCompAnsi;
  778. StrICompAnsiStringProc:=@AnsiStrIComp;
  779. StrLCompAnsiStringProc:=@AnsiStrLComp;
  780. StrLICompAnsiStringProc:=@AnsiStrLIComp;
  781. StrLowerAnsiStringProc:=@AnsiStrLower;
  782. StrUpperAnsiStringProc:=@AnsiStrUpper;
  783. ThreadInitProc:=@InitThread;
  784. ThreadFiniProc:=@FiniThread;
  785. { Unicode }
  786. Unicode2AnsiMoveProc:=@Wide2AnsiMove;
  787. Ansi2UnicodeMoveProc:=@Ansi2WideMove;
  788. UpperUnicodeStringProc:=@UpperWideString;
  789. LowerUnicodeStringProc:=@LowerWideString;
  790. CompareUnicodeStringProc:=@CompareWideString;
  791. CompareTextUnicodeStringProc:=@CompareTextWideString;
  792. end;
  793. SetUnicodeStringManager(CWideStringManager);
  794. end;
  795. initialization
  796. SetCWideStringManager;
  797. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  798. { with the information from the environment variables according to POSIX }
  799. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  800. setlocale(LC_ALL,'');
  801. { set the DefaultSystemCodePage }
  802. DefaultSystemCodePage:=iconv2win(ansistring(nl_langinfo(CODESET)));
  803. { init conversion tables for main program }
  804. InitThread;
  805. finalization
  806. { fini conversion tables for main program }
  807. FiniThread;
  808. end.