cwstring.pp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2005 by Florian Klaempfl,
  4. member of the Free Pascal development team.
  5. libc based wide string support
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$inline on}
  14. unit cwstring;
  15. interface
  16. procedure SetCWidestringManager;
  17. implementation
  18. {$linklib c}
  19. {$if not defined(linux) and not defined(solaris)} // Linux (and maybe glibc platforms in general), have iconv in glibc.
  20. {$linklib iconv}
  21. {$define useiconv}
  22. {$endif linux}
  23. Uses
  24. BaseUnix,
  25. ctypes,
  26. unix,
  27. unixtype,
  28. initc;
  29. Const
  30. {$ifndef useiconv}
  31. libiconvname='c'; // is in libc under Linux.
  32. {$else}
  33. libiconvname='iconv';
  34. {$endif}
  35. { helper functions from libc }
  36. function tolower(__wc:cint):cint;cdecl;external libiconvname name 'tolower';
  37. function toupper(__wc:cint):cint;cdecl;external libiconvname name 'toupper';
  38. function towlower(__wc:wint_t):wint_t;cdecl;external libiconvname name 'towlower';
  39. function towupper(__wc:wint_t):wint_t;cdecl;external libiconvname name 'towupper';
  40. function wcscoll (__s1:pwchar_t; __s2:pwchar_t):cint;cdecl;external libiconvname name 'wcscoll';
  41. function strcoll (__s1:pchar; __s2:pchar):cint;cdecl;external libiconvname name 'strcoll';
  42. function setlocale(category: cint; locale: pchar): pchar; cdecl; external clib name 'setlocale';
  43. {$ifndef beos}
  44. function mbrtowc(pwc: pwchar_t; const s: pchar; n: size_t; ps: pmbstate_t): size_t; cdecl; external clib name 'mbrtowc';
  45. function wcrtomb(s: pchar; wc: wchar_t; ps: pmbstate_t): size_t; cdecl; external clib name 'wcrtomb';
  46. {$else beos}
  47. function mbtowc(pwc: pwchar_t; const s: pchar; n: size_t): size_t; cdecl; external clib name 'mbtowc';
  48. function wctomb(s: pchar; wc: wchar_t): size_t; cdecl; external clib name 'wctomb';
  49. {$endif beos}
  50. const
  51. {$ifdef linux}
  52. __LC_CTYPE = 0;
  53. LC_ALL = 6;
  54. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  55. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  56. CODESET = _NL_CTYPE_CODESET_NAME;
  57. {$else linux}
  58. {$ifdef darwin}
  59. CODESET = 0;
  60. LC_ALL = 0;
  61. {$else darwin}
  62. {$ifdef FreeBSD} // actually FreeBSD5. internationalisation is afaik not default on 4.
  63. __LC_CTYPE = 0;
  64. LC_ALL = 0;
  65. _NL_CTYPE_CLASS = (__LC_CTYPE shl 16);
  66. _NL_CTYPE_CODESET_NAME = (_NL_CTYPE_CLASS)+14;
  67. CODESET = 0; // _NL_CTYPE_CODESET_NAME;
  68. {$else freebsd}
  69. {$ifdef solaris}
  70. CODESET=49;
  71. LC_ALL = 6;
  72. {$else solaris}
  73. {$ifdef beos}
  74. {$warning check correct value for BeOS}
  75. CODESET=49;
  76. LC_ALL = 6; // Checked for BeOS, but 0 under Haiku...
  77. ESysEILSEQ = EILSEQ;
  78. {$else}
  79. {$error lookup the value of CODESET in /usr/include/langinfo.h, and the value of LC_ALL in /usr/include/locale.h for your OS }
  80. // and while doing it, check if iconv is in libc, and if the symbols are prefixed with iconv_ or libiconv_
  81. {$endif beos}
  82. {$endif solaris}
  83. {$endif FreeBSD}
  84. {$endif darwin}
  85. {$endif linux}
  86. { unicode encoding name }
  87. {$ifdef FPC_LITTLE_ENDIAN}
  88. unicode_encoding2 = 'UTF-16LE';
  89. unicode_encoding4 = 'UCS-4LE';
  90. {$else FPC_LITTLE_ENDIAN}
  91. unicode_encoding2 = 'UTF-16BE';
  92. unicode_encoding4 = 'UCS-4BE';
  93. {$endif FPC_LITTLE_ENDIAN}
  94. type
  95. piconv_t = ^iconv_t;
  96. iconv_t = pointer;
  97. nl_item = cint;
  98. {$ifndef beos}
  99. function nl_langinfo(__item:nl_item):pchar;cdecl;external libiconvname name 'nl_langinfo';
  100. {$endif}
  101. {$if (not defined(bsd) and not defined(beos)) or defined(darwin)}
  102. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'iconv_open';
  103. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'iconv';
  104. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'iconv_close';
  105. {$else}
  106. function iconv_open(__tocode:pchar; __fromcode:pchar):iconv_t;cdecl;external libiconvname name 'libiconv_open';
  107. function iconv(__cd:iconv_t; __inbuf:ppchar; __inbytesleft:psize_t; __outbuf:ppchar; __outbytesleft:psize_t):size_t;cdecl;external libiconvname name 'libiconv';
  108. function iconv_close(__cd:iconv_t):cint;cdecl;external libiconvname name 'libiconv_close';
  109. {$endif}
  110. threadvar
  111. iconv_ansi2ucs4,
  112. iconv_ucs42ansi,
  113. iconv_ansi2wide,
  114. iconv_wide2ansi : iconv_t;
  115. {$ifdef beos}
  116. function nl_langinfo(__item:nl_item):pchar;
  117. begin
  118. {$warning TODO BeOS nl_langinfo or more uptodate port of iconv...}
  119. // Now implement the minimum required to correctly initialize WideString support
  120. case __item of
  121. CODESET : Result := 'UTF-8'; // BeOS use UTF-8
  122. else
  123. begin
  124. Assert(False, 'nl_langinfo was called with an unknown nl_item value');
  125. Result := '';
  126. end;
  127. end;
  128. end;
  129. {$endif}
  130. procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
  131. var
  132. outlength,
  133. outoffset,
  134. srclen,
  135. outleft : size_t;
  136. srcpos : pwidechar;
  137. destpos: pchar;
  138. mynil : pchar;
  139. my0 : size_t;
  140. err: cint;
  141. begin
  142. mynil:=nil;
  143. my0:=0;
  144. { rought estimation }
  145. setlength(dest,len*3);
  146. outlength:=len*3;
  147. srclen:=len*2;
  148. srcpos:=source;
  149. destpos:=pchar(dest);
  150. outleft:=outlength;
  151. while iconv(iconv_wide2ansi,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do
  152. begin
  153. err:=fpgetCerrno;
  154. case err of
  155. { last character is incomplete sequence }
  156. ESysEINVAL,
  157. { incomplete sequence in the middle }
  158. ESysEILSEQ:
  159. begin
  160. { skip and set to '?' }
  161. inc(srcpos);
  162. dec(srclen,2);
  163. destpos^:='?';
  164. inc(destpos);
  165. dec(outleft);
  166. { reset }
  167. iconv(iconv_wide2ansi,@mynil,@my0,@mynil,@my0);
  168. if err=ESysEINVAL then
  169. break;
  170. end;
  171. ESysE2BIG:
  172. begin
  173. outoffset:=destpos-pchar(dest);
  174. { extend }
  175. setlength(dest,outlength+len*3);
  176. inc(outleft,len*3);
  177. inc(outlength,len*3);
  178. { string could have been moved }
  179. destpos:=pchar(dest)+outoffset;
  180. end;
  181. else
  182. runerror(231);
  183. end;
  184. end;
  185. // truncate string
  186. setlength(dest,length(dest)-outleft);
  187. end;
  188. procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
  189. var
  190. outlength,
  191. outoffset,
  192. outleft : size_t;
  193. srcpos,
  194. destpos: pchar;
  195. mynil : pchar;
  196. my0 : size_t;
  197. err: cint;
  198. begin
  199. mynil:=nil;
  200. my0:=0;
  201. // extra space
  202. outlength:=len+1;
  203. setlength(dest,outlength);
  204. srcpos:=source;
  205. destpos:=pchar(dest);
  206. outleft:=outlength*2;
  207. while iconv(iconv_ansi2wide,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  208. begin
  209. err:=fpgetCerrno;
  210. case err of
  211. ESysEINVAL,
  212. ESysEILSEQ:
  213. begin
  214. { skip and set to '?' }
  215. inc(srcpos);
  216. dec(len);
  217. pwidechar(destpos)^:='?';
  218. inc(destpos,2);
  219. dec(outleft,2);
  220. { reset }
  221. iconv(iconv_ansi2wide,@mynil,@my0,@mynil,@my0);
  222. if err=ESysEINVAL then
  223. break;
  224. end;
  225. ESysE2BIG:
  226. begin
  227. outoffset:=destpos-pchar(dest);
  228. { extend }
  229. setlength(dest,outlength+len);
  230. inc(outleft,len*2);
  231. inc(outlength,len);
  232. { string could have been moved }
  233. destpos:=pchar(dest)+outoffset;
  234. end;
  235. else
  236. runerror(231);
  237. end;
  238. end;
  239. // truncate string
  240. setlength(dest,length(dest)-outleft div 2);
  241. end;
  242. function LowerWideString(const s : WideString) : WideString;
  243. var
  244. i : SizeInt;
  245. begin
  246. SetLength(result,length(s));
  247. for i:=1 to length(s) do
  248. result[i]:=WideChar(towlower(wint_t(s[i])));
  249. end;
  250. function UpperWideString(const s : WideString) : WideString;
  251. var
  252. i : SizeInt;
  253. begin
  254. SetLength(result,length(s));
  255. for i:=1 to length(s) do
  256. result[i]:=WideChar(towupper(wint_t(s[i])));
  257. end;
  258. procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
  259. begin
  260. if (len>length(s)) then
  261. if (length(s) < 10*256) then
  262. setlength(s,length(s)+10)
  263. else
  264. setlength(s,length(s)+length(s) shr 8);
  265. end;
  266. procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
  267. begin
  268. EnsureAnsiLen(s,index);
  269. pchar(@s[index])^:=c;
  270. inc(index);
  271. end;
  272. { concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
  273. {$ifndef beos}
  274. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt; var mbstate: mbstate_t);
  275. {$else not beos}
  276. procedure ConcatUTF32ToAnsiStr(const nc: wint_t; var S: AnsiString; var index: SizeInt);
  277. {$endif beos}
  278. var
  279. p : pchar;
  280. mblen : size_t;
  281. begin
  282. { we know that s is unique -> avoid uniquestring calls}
  283. p:=@s[index];
  284. if (nc<=127) then
  285. ConcatCharToAnsiStr(char(nc),s,index)
  286. else
  287. begin
  288. { en_US.UTF-8 needs maximally 6 chars, UCS-4/UTF-32 needs 4 }
  289. { -> 10 should be enough? Should actually use MB_CUR_MAX, but }
  290. { that's a libc macro mapped to internal functions/variables }
  291. { and thus not a stable external API on systems where libc }
  292. { breaks backwards compatibility every now and then }
  293. EnsureAnsiLen(s,index+10);
  294. {$ifndef beos}
  295. mblen:=wcrtomb(p,wchar_t(nc),@mbstate);
  296. {$else not beos}
  297. mblen:=wctomb(p,wchar_t(nc));
  298. {$endif not beos}
  299. if (mblen<>size_t(-1)) then
  300. inc(index,mblen)
  301. else
  302. begin
  303. { invalid wide char }
  304. p^:='?';
  305. inc(index);
  306. end;
  307. end;
  308. end;
  309. function LowerAnsiString(const s : AnsiString) : AnsiString;
  310. var
  311. i, slen,
  312. resindex : SizeInt;
  313. mblen : size_t;
  314. {$ifndef beos}
  315. ombstate,
  316. nmbstate : mbstate_t;
  317. {$endif beos}
  318. wc : wchar_t;
  319. begin
  320. {$ifndef beos}
  321. fillchar(ombstate,sizeof(ombstate),0);
  322. fillchar(nmbstate,sizeof(nmbstate),0);
  323. {$endif beos}
  324. slen:=length(s);
  325. SetLength(result,slen+10);
  326. i:=1;
  327. resindex:=1;
  328. while (i<=slen) do
  329. begin
  330. if (s[i]<=#127) then
  331. begin
  332. ConcatCharToAnsiStr(char(tolower(cint(s[i]))),result,resindex);
  333. inc(i)
  334. end
  335. else
  336. begin
  337. {$ifndef beos}
  338. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  339. {$else not beos}
  340. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  341. {$endif not beos}
  342. case mblen of
  343. size_t(-2):
  344. begin
  345. { partial invalid character, copy literally }
  346. while (i<=slen) do
  347. begin
  348. ConcatCharToAnsiStr(s[i],result,resindex);
  349. inc(i);
  350. end;
  351. end;
  352. size_t(-1), 0:
  353. begin
  354. { invalid or null character }
  355. ConcatCharToAnsiStr(s[i],result,resindex);
  356. inc(i);
  357. end;
  358. else
  359. begin
  360. { a valid sequence }
  361. { even if mblen = 1, the lowercase version may have a }
  362. { different length }
  363. { We can't do anything special if wchar_t is 16 bit... }
  364. {$ifndef beos}
  365. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex,nmbstate);
  366. {$else not beos}
  367. ConcatUTF32ToAnsiStr(towlower(wint_t(wc)),result,resindex);
  368. {$endif not beos}
  369. inc(i,mblen);
  370. end;
  371. end;
  372. end;
  373. end;
  374. SetLength(result,resindex-1);
  375. end;
  376. function UpperAnsiString(const s : AnsiString) : AnsiString;
  377. var
  378. i, slen,
  379. resindex : SizeInt;
  380. mblen : size_t;
  381. {$ifndef beos}
  382. ombstate,
  383. nmbstate : mbstate_t;
  384. {$endif beos}
  385. wc : wchar_t;
  386. begin
  387. {$ifndef beos}
  388. fillchar(ombstate,sizeof(ombstate),0);
  389. fillchar(nmbstate,sizeof(nmbstate),0);
  390. {$endif beos}
  391. slen:=length(s);
  392. SetLength(result,slen+10);
  393. i:=1;
  394. resindex:=1;
  395. while (i<=slen) do
  396. begin
  397. if (s[i]<=#127) then
  398. begin
  399. ConcatCharToAnsiStr(char(toupper(cint(s[i]))),result,resindex);
  400. inc(i)
  401. end
  402. else
  403. begin
  404. {$ifndef beos}
  405. mblen:=mbrtowc(@wc, pchar(@s[i]), slen-i+1, @ombstate);
  406. {$else not beos}
  407. mblen:=mbtowc(@wc, pchar(@s[i]), slen-i+1);
  408. {$endif beos}
  409. case mblen of
  410. size_t(-2):
  411. begin
  412. { partial invalid character, copy literally }
  413. while (i<=slen) do
  414. begin
  415. ConcatCharToAnsiStr(s[i],result,resindex);
  416. inc(i);
  417. end;
  418. end;
  419. size_t(-1), 0:
  420. begin
  421. { invalid or null character }
  422. ConcatCharToAnsiStr(s[i],result,resindex);
  423. inc(i);
  424. end;
  425. else
  426. begin
  427. { a valid sequence }
  428. { even if mblen = 1, the uppercase version may have a }
  429. { different length }
  430. { We can't do anything special if wchar_t is 16 bit... }
  431. {$ifndef beos}
  432. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex,nmbstate);
  433. {$else not beos}
  434. ConcatUTF32ToAnsiStr(towupper(wint_t(wc)),result,resindex);
  435. {$endif not beos}
  436. inc(i,mblen);
  437. end;
  438. end;
  439. end;
  440. end;
  441. SetLength(result,resindex-1);
  442. end;
  443. procedure Ansi2UCS4Move(source:pchar;var dest:UCS4String;len:SizeInt);
  444. var
  445. outlength,
  446. outoffset,
  447. outleft : size_t;
  448. err: cint;
  449. srcpos,
  450. destpos: pchar;
  451. mynil : pchar;
  452. my0 : size_t;
  453. begin
  454. mynil:=nil;
  455. my0:=0;
  456. // extra space
  457. outlength:=len+1;
  458. setlength(dest,outlength);
  459. outlength:=len+1;
  460. srcpos:=source;
  461. destpos:=pchar(dest);
  462. outleft:=outlength*4;
  463. while iconv(iconv_ansi2ucs4,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do
  464. begin
  465. err:=fpgetCerrno;
  466. case err of
  467. ESysEINVAL,
  468. ESysEILSEQ:
  469. begin
  470. { skip and set to '?' }
  471. inc(srcpos);
  472. dec(len);
  473. plongint(destpos)^:=longint('?');
  474. inc(destpos,4);
  475. dec(outleft,4);
  476. { reset }
  477. iconv(iconv_ansi2ucs4,@mynil,@my0,@mynil,@my0);
  478. if err=ESysEINVAL then
  479. break;
  480. end;
  481. ESysE2BIG:
  482. begin
  483. outoffset:=destpos-pchar(dest);
  484. { extend }
  485. setlength(dest,outlength+len);
  486. inc(outleft,len*4);
  487. inc(outlength,len);
  488. { string could have been moved }
  489. destpos:=pchar(dest)+outoffset;
  490. end;
  491. else
  492. runerror(231);
  493. end;
  494. end;
  495. // truncate string
  496. setlength(dest,length(dest)-outleft div 4);
  497. end;
  498. function CompareWideString(const s1, s2 : WideString) : PtrInt;
  499. var
  500. hs1,hs2 : UCS4String;
  501. begin
  502. hs1:=WideStringToUCS4String(s1);
  503. hs2:=WideStringToUCS4String(s2);
  504. result:=wcscoll(pwchar_t(hs1),pwchar_t(hs2));
  505. end;
  506. function CompareTextWideString(const s1, s2 : WideString): PtrInt;
  507. begin
  508. result:=CompareWideString(UpperWideString(s1),UpperWideString(s2));
  509. end;
  510. function StrCompAnsi(s1,s2 : PChar): PtrInt;
  511. begin
  512. result:=strcoll(s1,s2);
  513. end;
  514. procedure InitThread;
  515. begin
  516. iconv_wide2ansi:=iconv_open(nl_langinfo(CODESET),unicode_encoding2);
  517. iconv_ansi2wide:=iconv_open(unicode_encoding2,nl_langinfo(CODESET));
  518. iconv_ucs42ansi:=iconv_open(nl_langinfo(CODESET),unicode_encoding4);
  519. iconv_ansi2ucs4:=iconv_open(unicode_encoding4,nl_langinfo(CODESET));
  520. end;
  521. procedure FiniThread;
  522. begin
  523. if (iconv_wide2ansi <> iconv_t(-1)) then
  524. iconv_close(iconv_wide2ansi);
  525. if (iconv_ansi2wide <> iconv_t(-1)) then
  526. iconv_close(iconv_ansi2wide);
  527. if (iconv_ucs42ansi <> iconv_t(-1)) then
  528. iconv_close(iconv_ucs42ansi);
  529. if (iconv_ansi2ucs4 <> iconv_t(-1)) then
  530. iconv_close(iconv_ansi2ucs4);
  531. end;
  532. Procedure SetCWideStringManager;
  533. Var
  534. CWideStringManager : TWideStringManager;
  535. begin
  536. CWideStringManager:=widestringmanager;
  537. With CWideStringManager do
  538. begin
  539. Wide2AnsiMoveProc:=@Wide2AnsiMove;
  540. Ansi2WideMoveProc:=@Ansi2WideMove;
  541. UpperWideStringProc:=@UpperWideString;
  542. LowerWideStringProc:=@LowerWideString;
  543. CompareWideStringProc:=@CompareWideString;
  544. CompareTextWideStringProc:=@CompareTextWideString;
  545. {
  546. CharLengthPCharProc
  547. }
  548. UpperAnsiStringProc:=@UpperAnsiString;
  549. LowerAnsiStringProc:=@LowerAnsiString;
  550. {
  551. CompareStrAnsiStringProc
  552. CompareTextAnsiStringProc
  553. }
  554. StrCompAnsiStringProc:=@StrCompAnsi;
  555. {
  556. StrICompAnsiStringProc
  557. StrLCompAnsiStringProc
  558. StrLICompAnsiStringProc
  559. StrLowerAnsiStringProc
  560. StrUpperAnsiStringProc
  561. }
  562. ThreadInitProc:=@InitThread;
  563. ThreadFiniProc:=@FiniThread;
  564. end;
  565. SetWideStringManager(CWideStringManager);
  566. end;
  567. initialization
  568. SetCWideStringManager;
  569. { you have to call setlocale(LC_ALL,'') to initialise the langinfo stuff }
  570. { with the information from the environment variables according to POSIX }
  571. { (some OSes do this automatically, but e.g. Darwin and Solaris don't) }
  572. setlocale(LC_ALL,'');
  573. { init conversion tables for main program }
  574. InitThread;
  575. finalization
  576. { fini conversion tables for main program }
  577. FiniThread;
  578. end.