widestr.pas 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. {
  2. Copyright (c) 2000-2002 by Florian Klaempfl
  3. This unit contains basic functions for unicode support in the
  4. compiler, this unit is mainly necessary to bootstrap widestring
  5. support ...
  6. This program is free software; you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation; either version 2 of the License, or
  9. (at your option) any later version.
  10. This program is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License
  15. along with this program; if not, write to the Free Software
  16. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17. ****************************************************************************
  18. }
  19. unit widestr;
  20. {$i fpcdefs.inc}
  21. interface
  22. uses
  23. charset,globtype;
  24. type
  25. tcompilerwidechar = word;
  26. tcompilerwidecharptr = ^tcompilerwidechar;
  27. pcompilerwidechar = ^tcompilerwidechar;
  28. tcompilerwidechararray = array of tcompilerwidechar;
  29. tcompilerwidestring = class
  30. data : tcompilerwidechararray;
  31. maxlen,len : SizeInt;
  32. function asconstpunicodechar : PUnicodeChar;
  33. end;
  34. procedure initwidestring(out r : tcompilerwidestring);
  35. procedure donewidestring(var r : tcompilerwidestring);
  36. procedure setlengthwidestring(r : tcompilerwidestring;l : SizeInt);
  37. function getlengthwidestring(r : tcompilerwidestring) : SizeInt;
  38. procedure concatwidestringchar(r : tcompilerwidestring;c : tcompilerwidechar);
  39. procedure concatwidestrings(s1,s2 : tcompilerwidestring);
  40. function comparewidestrings(s1,s2 : tcompilerwidestring) : SizeInt;
  41. procedure copywidestring(s,d : tcompilerwidestring);
  42. function asciichar2unicode(c : char) : tcompilerwidechar;
  43. function unicode2asciichar(c : tcompilerwidechar) : char;
  44. procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : tcompilerwidestring;codepagetranslation : boolean = true);
  45. procedure unicode2ascii(r : tcompilerwidestring;p : pchar;cp : tstringencoding);
  46. procedure unicode2ascii(r : tcompilerwidestring;arr:TAnsiCharDynArray;cp : tstringencoding);
  47. function hasnonasciichars(const p: tcompilerwidestring): boolean;
  48. function getcharwidestring(r : tcompilerwidestring;l : SizeInt) : tcompilerwidechar;
  49. function cpavailable(const s: string) : boolean;
  50. function cpavailable(cp: word) : boolean;
  51. procedure changecodepage(
  52. s : pchar; l : SizeInt; scp : tstringencoding;
  53. d : pchar; dcp : tstringencoding
  54. );
  55. function codepagebyname(const s : string) : tstringencoding;
  56. function charlength(p: pchar; len: sizeint): sizeint;
  57. function charlength(const s: string): sizeint;
  58. implementation
  59. uses
  60. { use only small codepage maps, others will be }
  61. { loaded on demand from -FM path }
  62. { cyrillic code pages }
  63. cp1251,cp866,cp8859_5,
  64. { greek code page }
  65. cp1253,
  66. { other code pages }
  67. cp8859_1,cp850,cp437,cp1252,cp646,
  68. cp874, cp856,cp852,cp8859_2,
  69. cp1250,cp1254,cp1255,cp1256,cp1257,cp1258,
  70. globals,cutils;
  71. procedure initwidestring(out r : tcompilerwidestring);
  72. begin
  73. R:=tcompilerwidestring.create;
  74. r.data:=nil;
  75. r.len:=0;
  76. r.maxlen:=0;
  77. end;
  78. procedure donewidestring(var r : tcompilerwidestring);
  79. begin
  80. r.Free;
  81. r:=nil;
  82. end;
  83. function getcharwidestring(r : tcompilerwidestring;l : SizeInt) : tcompilerwidechar;
  84. begin
  85. getcharwidestring:=r.data[l];
  86. end;
  87. function getlengthwidestring(r : tcompilerwidestring) : SizeInt;
  88. begin
  89. getlengthwidestring:=r.len;
  90. end;
  91. procedure growwidestring(r : tcompilerwidestring;l : SizeInt);
  92. begin
  93. if r.maxlen>=l then
  94. exit;
  95. setlength(r.data,l);
  96. r.maxlen:=l;
  97. end;
  98. procedure setlengthwidestring(r : tcompilerwidestring;l : SizeInt);
  99. begin
  100. r.len:=l;
  101. if l>r.maxlen then
  102. growwidestring(r,l);
  103. end;
  104. procedure concatwidestringchar(r : tcompilerwidestring;c : tcompilerwidechar);
  105. begin
  106. if r.len>=r.maxlen then
  107. growwidestring(r,r.len+16);
  108. r.data[r.len]:=c;
  109. inc(r.len);
  110. end;
  111. procedure concatwidestrings(s1,s2 : tcompilerwidestring);
  112. begin
  113. growwidestring(s1,s1.len+s2.len);
  114. if s2.len>0 then
  115. move(s2.data[0],s1.data[s1.len],s2.len*sizeof(tcompilerwidechar));
  116. inc(s1.len,s2.len);
  117. end;
  118. procedure copywidestring(s,d : tcompilerwidestring);
  119. begin
  120. setlengthwidestring(d,s.len);
  121. if s.len>0 then
  122. move(s.data[0],d.data[0],s.len*sizeof(tcompilerwidechar));
  123. end;
  124. function comparewidestrings(s1,s2 : tcompilerwidestring) : SizeInt;
  125. var
  126. maxi,temp : SizeInt;
  127. begin
  128. if pointer(s1)=pointer(s2) then
  129. begin
  130. comparewidestrings:=0;
  131. exit;
  132. end;
  133. maxi:=s1.len;
  134. temp:=s2.len;
  135. if maxi>temp then
  136. maxi:=Temp;
  137. temp:=compareword(s1.data[0],s2.data[0],maxi);
  138. if temp=0 then
  139. temp:=s1.len-s2.len;
  140. comparewidestrings:=temp;
  141. end;
  142. function asciichar2unicode(c : char) : tcompilerwidechar;
  143. var
  144. m : punicodemap;
  145. begin
  146. if (current_settings.sourcecodepage <> CP_UTF8) then
  147. begin
  148. m:=getmap(current_settings.sourcecodepage);
  149. asciichar2unicode:=getunicode(c,m);
  150. end
  151. else
  152. result:=tcompilerwidechar(c);
  153. end;
  154. function unicode2asciichar(c : tcompilerwidechar) : char;
  155. {begin
  156. if word(c)<128 then
  157. unicode2asciichar:=char(word(c))
  158. else
  159. unicode2asciichar:='?';
  160. end;}
  161. begin
  162. Result := getascii(c,getmap(current_settings.sourcecodepage))[1];
  163. end;
  164. procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : tcompilerwidestring;codepagetranslation : boolean = true);
  165. var
  166. source : pchar;
  167. dest : tcompilerwidecharptr;
  168. i : SizeInt;
  169. m : punicodemap;
  170. begin
  171. m:=getmap(cp);
  172. setlengthwidestring(r,l);
  173. source:=p;
  174. dest:=tcompilerwidecharptr(r.data);
  175. if codepagetranslation then
  176. begin
  177. if cp<>CP_UTF8 then
  178. begin
  179. for i:=1 to l do
  180. begin
  181. dest^:=getunicode(source^,m);
  182. inc(dest);
  183. inc(source);
  184. end;
  185. end
  186. else
  187. begin
  188. r.len:=Utf8ToUnicode(punicodechar(r.data),r.maxlen,p,l);
  189. { -1, because utf8tounicode includes room for a terminating 0 in
  190. its result count }
  191. if r.len>0 then
  192. dec(r.len);
  193. end;
  194. end
  195. else
  196. begin
  197. for i:=1 to l do
  198. begin
  199. dest^:=tcompilerwidechar(source^);
  200. inc(dest);
  201. inc(source);
  202. end;
  203. end;
  204. end;
  205. procedure unicode2ascii(r : tcompilerwidestring;arr:TAnsiCharDynArray;cp : tstringencoding);
  206. begin
  207. if (r.len=0) or (length(arr)=0) then
  208. exit;
  209. unicode2ascii(r,Pchar(@arr[0]),cp);
  210. end;
  211. procedure unicode2ascii(r : tcompilerwidestring;p:pchar;cp : tstringencoding);
  212. var
  213. m : punicodemap;
  214. source : tcompilerwidecharptr;
  215. dest : pchar;
  216. i : longint;
  217. begin
  218. { can't implement that here, because the memory size for p() cannot
  219. be changed here, and we may need more bytes than have been allocated }
  220. if cp=CP_UTF8 then
  221. internalerrorproc(2015092701);
  222. if (cp = 0) or (cp=CP_NONE) then
  223. m:=getmap(current_settings.sourcecodepage)
  224. else
  225. m:=getmap(cp);
  226. source:=tcompilerwidecharptr(r.data);
  227. dest:=p;
  228. for i:=1 to r.len do
  229. begin
  230. dest^ := getascii(source^,m)[1];
  231. inc(dest);
  232. inc(source);
  233. end;
  234. end;
  235. function hasnonasciichars(const p: tcompilerwidestring): boolean;
  236. var
  237. source : tcompilerwidecharptr;
  238. i : longint;
  239. begin
  240. source:=tcompilerwidecharptr(p.data);
  241. result:=true;
  242. for i:=1 to p.len do
  243. begin
  244. if word(source^)>=128 then
  245. exit;
  246. inc(source);
  247. end;
  248. result:=false;
  249. end;
  250. function cpavailable(const s: string): boolean;
  251. begin
  252. result:=mappingavailable(lower(s));
  253. if not result then
  254. result:=(unicodepath<>'')and(registerbinarymapping(unicodepath+'charset',lower(s)));
  255. end;
  256. function cpavailable(cp: word): boolean;
  257. begin
  258. result:=mappingavailable(cp);
  259. if not result then
  260. result:=(unicodepath<>'')and(registerbinarymapping(unicodepath+'charset','cp'+tostr(cp)));
  261. end;
  262. procedure changecodepage(
  263. s : pchar; l : SizeInt; scp : tstringencoding;
  264. d : pchar; dcp : tstringencoding
  265. );
  266. var
  267. ms, md : punicodemap;
  268. source : pchar;
  269. dest : pchar;
  270. i : longint;
  271. begin
  272. ms:=getmap(scp);
  273. md:=getmap(dcp);
  274. source:=s;
  275. dest:=d;
  276. for i:=1 to l do
  277. begin
  278. dest^ := getascii(getunicode(source^,ms),md)[1];
  279. inc(dest);
  280. inc(source);
  281. end;
  282. end;
  283. function codepagebyname(const s : string) : tstringencoding;
  284. var
  285. p : punicodemap;
  286. begin
  287. Result:=0;
  288. p:=getmap(lower(s));
  289. if (p<>nil) then
  290. Result:=p^.cp;
  291. end;
  292. function charlength(p: pchar; len: sizeint): sizeint;
  293. var
  294. p2: pchar;
  295. i, chars, codepointlen: sizeint;
  296. begin
  297. if len=0 then
  298. begin
  299. result:=0;
  300. exit;
  301. end;
  302. { Length of the string converted to a SBCS codepage (e.g. ISO 8859-1)
  303. should be equal to the amount of characters in the source string. }
  304. if defaultsystemcodepage=cp_utf8 then
  305. { ChangeCodePage does not work for UTF-8 apparently... :-( }
  306. begin
  307. i:=1;
  308. chars:=0;
  309. while i<=len do
  310. begin
  311. codepointlen:=utf8codepointlen(p,len-i+1,true);
  312. inc(i,codepointlen);
  313. inc(p,codepointlen);
  314. inc(chars);
  315. end;
  316. result:=chars;
  317. end
  318. else if cpavailable(defaultsystemcodepage) then
  319. begin
  320. getmem(p2,succ(len));
  321. fillchar(p2^,succ(len),0);
  322. changecodepage(p,len,defaultsystemcodepage,p2,28591);
  323. result:=strlen(p2);
  324. freemem(p2,succ(len));
  325. end
  326. else
  327. result:=len;
  328. end;
  329. function charlength(const s: string): sizeint;
  330. begin
  331. result:=charlength(@s[1],length(s));
  332. end;
  333. { tcompilerwidestring }
  334. const
  335. cEmptyUnicodeChar : UnicodeChar = #0;
  336. function tcompilerwidestring.asconstpunicodechar: PUnicodeChar;
  337. begin
  338. if length(data)>0 then
  339. result:=@Data[0]
  340. else
  341. result:=@cEmptyUnicodeChar;
  342. end;
  343. end.