charset.pp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2000 by Florian Klaempfl
  4. member of the Free Pascal development team.
  5. This unit implements several classes for charset conversions
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$pointermath on}
  14. unit charset;
  15. interface
  16. type
  17. tunicodechar = word;
  18. tunicodestring = ^tunicodechar;
  19. tcsconvert = class
  20. // !!!!!!1constructor create;
  21. end;
  22. tunicodecharmappingflag = (umf_noinfo,umf_leadbyte,umf_undefined,
  23. umf_unused);
  24. punicodecharmapping = ^tunicodecharmapping;
  25. tunicodecharmapping = record
  26. unicode : tunicodechar;
  27. flag : tunicodecharmappingflag;
  28. reserved : byte;
  29. end;
  30. preversecharmapping = ^treversecharmapping;
  31. treversecharmapping = record
  32. unicode : tunicodechar;
  33. char1 : Byte;
  34. char2 : Byte;
  35. end;
  36. punicodemap = ^tunicodemap;
  37. tunicodemap = record
  38. cpname : string[20];
  39. cp : word;
  40. map : punicodecharmapping;
  41. lastchar : longint;
  42. reversemap : preversecharmapping;
  43. reversemaplength : longint;
  44. next : punicodemap;
  45. internalmap : boolean;
  46. end;
  47. tcp2unicode = class(tcsconvert)
  48. end;
  49. function loadunicodemapping(const cpname,f : string; cp :word) : punicodemap;
  50. procedure registermapping(p : punicodemap);
  51. function getmap(const s : string) : punicodemap;
  52. function getmap(cp : word) : punicodemap;
  53. function mappingavailable(const s : string) : boolean;inline;
  54. function mappingavailable(cp :word) : boolean;inline;
  55. function getunicode(c : char;p : punicodemap) : tunicodechar;inline;
  56. function getunicode(
  57. AAnsiStr : pansichar;
  58. AAnsiLen : LongInt;
  59. AMap : punicodemap;
  60. ADest : tunicodestring
  61. ) : LongInt;
  62. function getascii(c : tunicodechar;p : punicodemap) : string;
  63. function getascii(c : tunicodechar;p : punicodemap; ABuffer : PAnsiChar; ABufferLen : LongInt) : LongInt;
  64. implementation
  65. const
  66. UNKNOW_CHAR_A = ansichar(63);
  67. UNKNOW_CHAR_W = tunicodechar(63);
  68. var
  69. mappings : punicodemap;
  70. procedure QuickSort(AList: preversecharmapping; L, R : Longint);
  71. var
  72. I, J : Longint;
  73. P, Q : treversecharmapping;
  74. begin
  75. repeat
  76. I:=L;
  77. J:=R;
  78. P:=AList[(L + R) div 2];
  79. repeat
  80. while (P.unicode-AList[I].unicode) > 0 do
  81. I:=I+1;
  82. while (P.unicode-AList[J].unicode) < 0 do
  83. J:=J-1;
  84. if I<=J then
  85. begin
  86. Q:=AList[I];
  87. AList[I]:=AList[J];
  88. AList[J]:=Q;
  89. I:=I+1;
  90. J:=J-1;
  91. end;
  92. until I > J;
  93. if J-L < R-I then
  94. begin
  95. if L<J then
  96. QuickSort(AList, L, J);
  97. L:=I;
  98. end
  99. else
  100. begin
  101. if I < R then
  102. QuickSort(AList, I, R);
  103. R:=J;
  104. end;
  105. until L>=R;
  106. end;
  107. function find(
  108. const c : tunicodechar;
  109. const AData : preversecharmapping;
  110. const ALen : LongInt
  111. ) : preversecharmapping;overload;
  112. var
  113. l, h, m : longint;
  114. r:preversecharmapping;
  115. begin
  116. if ALen=0 then
  117. exit(nil);
  118. r:=AData;
  119. l:=0;
  120. h:=ALen-1;
  121. while l<h do begin
  122. m:=(l+h) div 2;
  123. if r[m].unicode<c then
  124. l:=m+1
  125. else
  126. h:=m;
  127. end;
  128. if (l=h) and (r[l].unicode=c) then
  129. Result:=@r[l]
  130. else
  131. Result:=nil;
  132. end;
  133. function find(
  134. const c : tunicodechar;
  135. const p : punicodemap
  136. ) : preversecharmapping;overload;inline;
  137. begin
  138. Result:=find(c,p^.reversemap,p^.reversemaplength);
  139. end;
  140. function RemoveDuplicates(
  141. const AData : preversecharmapping;
  142. const ALen : LongInt;
  143. out AResultLen : LongInt
  144. ) : preversecharmapping;
  145. var
  146. r0, r, p, t : preversecharmapping;
  147. i, c, actualCount : LongInt;
  148. begin
  149. c:=ALen;
  150. GetMem(r0,c*SizeOf(treversecharmapping));
  151. r:=r0;
  152. p:=AData;
  153. actualCount:=0;
  154. i:=0;
  155. while i<c do
  156. begin
  157. t:=find(p^.unicode,r0,actualCount);
  158. if t=nil then
  159. begin
  160. r^:=p^;
  161. actualCount:=actualCount+1;
  162. Inc(r);
  163. end
  164. else
  165. begin
  166. if (p^.char1<t^.char1) or
  167. ((p^.char1=t^.char1) and (p^.char2<t^.char2))
  168. then
  169. t^:=p^;//keep the first mapping
  170. end;
  171. i:=i+1;
  172. Inc(p);
  173. end;
  174. if c<>actualCount then
  175. ReAllocMem(r0,actualCount*SizeOf(treversecharmapping));
  176. AResultLen:=actualCount;
  177. Result:=r0;
  178. end;
  179. function buildreversemap(
  180. const AMapping : punicodecharmapping;
  181. const ALen : LongInt;
  182. out AResultLen : LongInt
  183. ) : preversecharmapping;
  184. var
  185. r0, r, t : preversecharmapping;
  186. i, c, actualCount, ti : LongInt;
  187. p : punicodecharmapping;
  188. begin
  189. if (ALen<1) then
  190. exit(nil);
  191. p:=AMapping;
  192. c:=ALen;
  193. GetMem(r0,c*SizeOf(treversecharmapping));
  194. r:=r0;
  195. actualCount:=0;
  196. i:=0;
  197. while i<c do
  198. begin
  199. if (p^.flag=umf_noinfo) then
  200. begin
  201. r^.unicode:=p^.unicode;
  202. if i<=High(Byte) then
  203. begin
  204. r^.char1:=i;
  205. r^.char2:=0;
  206. end
  207. else
  208. begin
  209. r^.char1:=i div 256;
  210. r^.char2:=i mod 256;
  211. end;
  212. actualCount:=actualCount+1;
  213. Inc(r);
  214. end;
  215. Inc(p);
  216. i:=i+1;
  217. end;
  218. if c<>actualCount then
  219. ReAllocMem(r0,actualCount*SizeOf(treversecharmapping));
  220. if actualCount>1 then
  221. begin
  222. QuickSort(r0,0,(actualCount-1));
  223. t:=RemoveDuplicates(r0,actualCount,ti);
  224. FreeMem(r0,actualCount*SizeOf(treversecharmapping));
  225. r0:=t;
  226. actualCount:=ti;
  227. end;
  228. AResultLen:=actualCount;
  229. Result:=r0;
  230. end;
  231. procedure inititems(const p : punicodecharmapping; const ALen : LongInt);
  232. const
  233. INIT_ITEM : tunicodecharmapping = (unicode:0; flag:umf_unused; reserved:0);
  234. var
  235. x : punicodecharmapping;
  236. i : LongInt;
  237. begin
  238. x:=p;
  239. for i:=0 to ALen-1 do
  240. begin
  241. x^:=INIT_ITEM;
  242. Inc(x);
  243. end;
  244. end;
  245. function loadunicodemapping(const cpname,f : string; cp :word) : punicodemap;
  246. var
  247. data : punicodecharmapping;
  248. datasize : longint;
  249. t : text;
  250. s,hs : string;
  251. scanpos,charpos,unicodevalue : longint;
  252. code : word;
  253. flag : tunicodecharmappingflag;
  254. p : punicodemap;
  255. lastchar, i : longint;
  256. begin
  257. lastchar:=-1;
  258. loadunicodemapping:=nil;
  259. datasize:=256;
  260. GetMem(data,sizeof(tunicodecharmapping)*datasize);
  261. inititems(data,datasize);
  262. assign(t,f);
  263. {$I-}
  264. reset(t);
  265. {$I+}
  266. if ioresult<>0 then
  267. begin
  268. freemem(data,sizeof(tunicodecharmapping)*datasize);
  269. exit;
  270. end;
  271. while not(eof(t)) do
  272. begin
  273. readln(t,s);
  274. if (s[1]='0') and (s[2]='x') then
  275. begin
  276. flag:=umf_unused;
  277. scanpos:=3;
  278. hs:='$';
  279. while s[scanpos] in ['0'..'9','A'..'F','a'..'f'] do
  280. begin
  281. hs:=hs+s[scanpos];
  282. inc(scanpos);
  283. end;
  284. val(hs,charpos,code);
  285. if code<>0 then
  286. begin
  287. freemem(data,sizeof(tunicodecharmapping)*datasize);
  288. close(t);
  289. exit;
  290. end;
  291. while not(s[scanpos] in ['0','#']) do
  292. inc(scanpos);
  293. if s[scanpos]='#' then
  294. begin
  295. { special char }
  296. unicodevalue:=$ffff;
  297. hs:=copy(s,scanpos,length(s)-scanpos+1);
  298. if hs='#DBCS LEAD BYTE' then
  299. flag:=umf_leadbyte;
  300. end
  301. else
  302. begin
  303. { C hex prefix }
  304. inc(scanpos,2);
  305. hs:='$';
  306. while s[scanpos] in ['0'..'9','A'..'F','a'..'f'] do
  307. begin
  308. hs:=hs+s[scanpos];
  309. inc(scanpos);
  310. end;
  311. val(hs,unicodevalue,code);
  312. if code<>0 then
  313. begin
  314. freemem(data,sizeof(tunicodecharmapping)*datasize);
  315. close(t);
  316. exit;
  317. end;
  318. if charpos>datasize then
  319. begin
  320. { allocate 1024 bytes more because }
  321. { if we need more than 256 entries it's }
  322. { probably a mbcs with a lot of }
  323. { entries }
  324. i:=datasize;
  325. datasize:=charpos+8*1024;
  326. reallocmem(data,sizeof(tunicodecharmapping)*datasize);
  327. inititems(@data[i],(datasize-i));
  328. end;
  329. flag:=umf_noinfo;
  330. end;
  331. data[charpos].flag:=flag;
  332. data[charpos].unicode:=unicodevalue;
  333. if charpos>lastchar then
  334. lastchar:=charpos;
  335. end;
  336. end;
  337. close(t);
  338. new(p);
  339. p^.lastchar:=lastchar;
  340. p^.cpname:=cpname;
  341. p^.cp:=cp;
  342. p^.internalmap:=false;
  343. p^.next:=nil;
  344. p^.map:=data;
  345. p^.reversemap:=buildreversemap(p^.map,(p^.lastchar+1),p^.reversemaplength);
  346. loadunicodemapping:=p;
  347. end;
  348. procedure registermapping(p : punicodemap);
  349. begin
  350. p^.next:=mappings;
  351. mappings:=p;
  352. end;
  353. {$ifdef FPC_HAS_FEATURE_THREADING}
  354. threadvar
  355. {$else FPC_HAS_FEATURE_THREADING}
  356. var
  357. {$endif FPC_HAS_FEATURE_THREADING}
  358. strmapcache : string;
  359. strmapcachep : punicodemap;
  360. function getmap(const s : string) : punicodemap;
  361. var
  362. hp : punicodemap;
  363. begin
  364. if (strmapcache=s) and assigned(strmapcachep) and (strmapcachep^.cpname=s) then
  365. begin
  366. getmap:=strmapcachep;
  367. exit;
  368. end;
  369. hp:=mappings;
  370. while assigned(hp) do
  371. begin
  372. if hp^.cpname=s then
  373. begin
  374. getmap:=hp;
  375. strmapcache:=s;
  376. strmapcachep:=hp;
  377. exit;
  378. end;
  379. hp:=hp^.next;
  380. end;
  381. getmap:=nil;
  382. end;////////
  383. {$ifdef FPC_HAS_FEATURE_THREADING}
  384. threadvar
  385. {$else FPC_HAS_FEATURE_THREADING}
  386. var
  387. {$endif FPC_HAS_FEATURE_THREADING}
  388. intmapcache : word;
  389. intmapcachep : punicodemap;
  390. function getmap(cp : word) : punicodemap;
  391. var
  392. hp : punicodemap;
  393. begin
  394. if (intmapcache=cp) and assigned(intmapcachep) and (intmapcachep^.cp=cp) then
  395. begin
  396. getmap:=intmapcachep;
  397. exit;
  398. end;
  399. hp:=mappings;
  400. while assigned(hp) do
  401. begin
  402. if hp^.cp=cp then
  403. begin
  404. getmap:=hp;
  405. intmapcache:=cp;
  406. intmapcachep:=hp;
  407. exit;
  408. end;
  409. hp:=hp^.next;
  410. end;
  411. getmap:=nil;
  412. end;
  413. function mappingavailable(const s : string) : boolean;
  414. begin
  415. mappingavailable:=getmap(s)<>nil;
  416. end;
  417. function mappingavailable(cp : word) : boolean;
  418. begin
  419. mappingavailable:=getmap(cp)<>nil;
  420. end;
  421. function getunicode(c : char;p : punicodemap) : tunicodechar;
  422. begin
  423. if ord(c)<=p^.lastchar then
  424. getunicode:=p^.map[ord(c)].unicode
  425. else
  426. getunicode:=0;
  427. end;
  428. function getunicode(
  429. AAnsiStr : pansichar;
  430. AAnsiLen : LongInt;
  431. AMap : punicodemap;
  432. ADest : tunicodestring
  433. ) : LongInt;
  434. var
  435. i, c, k, destLen : longint;
  436. ps : pansichar;
  437. pd : ^tunicodechar;
  438. begin
  439. if (AAnsiStr=nil) or (AAnsiLen<=0) then
  440. exit(0);
  441. ps:=AAnsiStr;
  442. if (ADest=nil) then
  443. begin
  444. c:=AAnsiLen-1;
  445. destLen:=0;
  446. i:=0;
  447. while (i<=c) do
  448. begin
  449. if (ord(ps^)<=AMap^.lastchar) then
  450. begin
  451. if (AMap^.map[ord(ps^)].flag=umf_leadbyte) and (i<c) then
  452. begin
  453. Inc(ps);
  454. i:=i+1;
  455. end;
  456. end;
  457. i:=i+1;
  458. Inc(ps);
  459. destLen:=destLen+1;
  460. end;
  461. exit(destLen);
  462. end;
  463. pd:=ADest;
  464. c:=AAnsiLen-1;
  465. i:=0;
  466. while (i<=c) do
  467. begin
  468. if (ord(ps^)<=AMap^.lastchar) then
  469. begin
  470. if (AMap^.map[ord(ps^)].flag=umf_leadbyte) then
  471. begin
  472. if (i<c) then
  473. begin
  474. k:=(Ord(ps^)*256);
  475. Inc(ps);
  476. i:=i+1;
  477. k:=k+Ord(ps^);
  478. if (k<=AMap^.lastchar) then
  479. pd^:=AMap^.map[k].unicode
  480. else
  481. pd^:=UNKNOW_CHAR_W;
  482. end
  483. else
  484. pd^:=UNKNOW_CHAR_W;
  485. end
  486. else
  487. pd^:=AMap^.map[ord(ps^)].unicode
  488. end
  489. else
  490. pd^:=UNKNOW_CHAR_W;
  491. i:=i+1;
  492. Inc(ps);
  493. Inc(pd);
  494. end;
  495. result:=((PtrUInt(pd)-PtrUInt(ADest)) div SizeOf(tunicodechar));
  496. end;
  497. function getascii(c : tunicodechar;p : punicodemap) : string;
  498. var
  499. rm : preversecharmapping;
  500. begin
  501. rm:=find(c,p);
  502. if rm<>nil then
  503. begin
  504. if rm^.char2=0 then
  505. begin
  506. SetLength(Result,1);
  507. Byte(Result[1]):=rm^.char1;
  508. end
  509. else
  510. begin
  511. SetLength(Result,2);
  512. Byte(Result[1]):=rm^.char1;
  513. Byte(Result[2]):=rm^.char2;
  514. end;
  515. end
  516. else
  517. Result:=UNKNOW_CHAR_A;
  518. end;
  519. function getascii(c : tunicodechar;p : punicodemap; ABuffer : PAnsiChar; ABufferLen : LongInt) : LongInt;
  520. var
  521. rm : preversecharmapping;
  522. begin
  523. if (ABuffer<>nil) and (ABufferLen<=0) then
  524. exit(-1);
  525. rm:=find(c,p);
  526. if rm<>nil then
  527. begin
  528. if (ABuffer=nil) then
  529. begin
  530. if rm^.char2=0 then
  531. Result:=1
  532. else
  533. Result:=2;
  534. end
  535. else
  536. begin
  537. if rm^.char2=0 then
  538. begin
  539. Byte(ABuffer^):=rm^.char1;
  540. Result:=1;
  541. end
  542. else
  543. begin
  544. if (ABufferLen<2) then
  545. Result:=-1
  546. else
  547. begin
  548. Byte(ABuffer^):=rm^.char1;
  549. Byte((ABuffer+1)^):=rm^.char2;
  550. Result:=2;
  551. end
  552. end;
  553. end;
  554. end
  555. else
  556. begin
  557. ABuffer^:=UNKNOW_CHAR_A;
  558. Result:=1;
  559. end;
  560. end;
  561. var
  562. hp : punicodemap;
  563. initialization
  564. mappings:=nil;
  565. finalization
  566. while assigned(mappings) do
  567. begin
  568. hp:=mappings^.next;
  569. if not(mappings^.internalmap) then
  570. begin
  571. freemem(mappings^.map);
  572. freemem(mappings^.reversemap);
  573. dispose(mappings);
  574. end;
  575. mappings:=hp;
  576. end;
  577. end.