utf8bidi.pp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. {
  2. Author Mazen NEIFER
  3. Licence LGPL
  4. }
  5. unit UTF8BIDI;
  6. {$mode objfpc}{$H+}
  7. interface
  8. uses
  9. FreeBIDI;
  10. type
  11. TUCS32Char = Cardinal;
  12. TUCS16Char = Word;
  13. TUTF8Char = String[4];
  14. TUTF8String = UTF8String;
  15. {****************************Conversion routines*******************************}
  16. {Converts an UCS 16/32 bits charcater to UTF8 character}
  17. function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
  18. {Converts a wide char UCS 16 bits chcarcter to UTF8 character}
  19. function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
  20. {Converts a wide char UCS 16 bits string to UTF8 character}
  21. function UnicodeToUTF8(const Src:TString):TUTF8String;
  22. {Converts an UTF8 character to UCS 32 bits character}
  23. function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
  24. {Converts an UTF8 character to UCS 16 bits character}
  25. function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
  26. {Converts an UTF8 string to UCS 16 bits string}
  27. function UTF8ToUnicode(const Src:TUTF8String):TString;
  28. {Converts an UTF8 string to a double byte string}
  29. function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
  30. function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
  31. {****************************Logical aspects***********************************}
  32. {Returns the number of logical characters}
  33. function LLength(const UTF8Str:TUTF8String):Cardinal;
  34. {Converts visual position to logical position}
  35. function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
  36. {Returns character at a given logical position according to paragraph direction}
  37. function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
  38. {****************************Visual aspects************************************}
  39. {Returns the number of visual characters}
  40. function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
  41. {Converts a logical position to a visual position}
  42. function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
  43. {Returns character at a given visual position according to paragraph direction}
  44. function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
  45. {Inserts a string into an other paying attention of RTL/LTR direction}
  46. procedure VInsert(const Src:TUTF8String; var Dest:TUTF8String; vp:Integer; pDir:TDirection);
  47. {Deletes a string into an other paying attention of RTL/LTR direction}
  48. procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
  49. {****************************Helper routines***********************************}
  50. {Returns direction of a character}
  51. function DirectionOf(Character:TUTF8Char):TDirection;
  52. {Returns contextual direction of caracter in a string}
  53. function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
  54. {Inserts a char as if it was typed using keyboard in the most user friendly way.
  55. Returns the new cursor position after insersion depending on the new visual text}
  56. function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
  57. {Returns a table mapping each visual position to its logical position in an UTF8*
  58. string}
  59. function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
  60. implementation
  61. function DumpStr(const s:TUTF8String):String;
  62. var
  63. i:Integer;
  64. begin
  65. Result := '';
  66. for i:= 1 to Length(s) do
  67. case s[i] of
  68. #0..#127:
  69. Result := Result + s[i];
  70. else
  71. Result := Result + '$' + HexStr(Ord(s[i]),2);
  72. end;
  73. end;
  74. function ComputeCharLength(p:PChar):Cardinal;
  75. begin
  76. if ord(p^)<%11000000
  77. then
  78. {regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
  79. Result:=1
  80. else if ((ord(p^) and %11100000) = %11000000)
  81. then
  82. if (ord(p[1]) and %11000000) = %10000000 then
  83. Result:=2
  84. else
  85. Result:=1
  86. else if ((ord(p^) and %11110000) = %11100000)
  87. then
  88. if ((ord(p[1]) and %11000000) = %10000000)
  89. and ((ord(p[2]) and %11000000) = %10000000)
  90. then
  91. Result:=3
  92. else
  93. Result:=1
  94. else if ((ord(p^) and %11111000) = %11110000)
  95. then
  96. if ((ord(p[1]) and %11000000) = %10000000)
  97. and ((ord(p[2]) and %11000000) = %10000000)
  98. and ((ord(p[3]) and %11000000) = %10000000)
  99. then
  100. Result:=4
  101. else
  102. Result:=1
  103. else
  104. Result:=1
  105. end;
  106. {****************************Conversion routines*******************************}
  107. function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
  108. begin
  109. case aChar of
  110. 0..$7f:
  111. begin
  112. Result[1]:=char(aChar);
  113. SetLength(UnicodeToUTF8,1);
  114. end;
  115. $80..$7ff:
  116. begin
  117. Result[1]:=char($c0 or (aChar shr 6));
  118. Result[2]:=char($80 or (aChar and $3f));
  119. SetLength(UnicodeToUTF8,2);
  120. end;
  121. $800..$ffff:
  122. begin
  123. SetLength(Result,3);
  124. Result[1]:=char($e0 or (aChar shr 12));
  125. Result[2]:=char($80 or ((aChar shr 6) and $3f));
  126. Result[3]:=char($80 or (aChar and $3f));
  127. end;
  128. $10000..$1fffff:
  129. begin
  130. SetLength(UnicodeToUTF8,4);
  131. Result[1]:=char($f0 or (aChar shr 18));
  132. Result[2]:=char($80 or ((aChar shr 12) and $3f));
  133. Result[3]:=char($80 or ((aChar shr 6) and $3f));
  134. Result[4]:=char($80 or (aChar and $3f));
  135. end;
  136. else
  137. SetLength(UnicodeToUTF8, 0);
  138. end;
  139. end;
  140. function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
  141. var
  142. c:TUCS16Char absolute aChar;
  143. begin
  144. case c of
  145. 0..$7f:
  146. begin
  147. Result[1]:=char(c);
  148. SetLength(UnicodeToUTF8,1);
  149. end;
  150. $80..$7ff:
  151. begin
  152. Result[1]:=char($c0 or (c shr 6));
  153. Result[2]:=char($80 or (c and $3f));
  154. SetLength(UnicodeToUTF8,2);
  155. end;
  156. else
  157. SetLength(UnicodeToUTF8, 0);
  158. end;
  159. end;
  160. function UnicodeToUTF8(const Src:TString):TUTF8String;
  161. var
  162. vp:Integer;
  163. begin
  164. vp := 1;
  165. Result := '';
  166. for vp := 1 to Length(Src) do
  167. Result += UnicodeToUTF8(Src[vp]);
  168. end;
  169. function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
  170. begin
  171. case ComputeCharLength(@UTF8Char[1]) of
  172. 1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
  173. Result := ord(UTF8Char[1]);
  174. 2:
  175. Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
  176. or (ord(UTF8Char[2]) and %00111111);
  177. 3:
  178. Result := ((ord(UTF8Char[1]) and %00011111) shl 12)
  179. or ((ord(UTF8Char[1]) and %00111111) shl 6)
  180. or (ord(UTF8Char[2]) and %00111111);
  181. 4:
  182. Result := ((ord(UTF8Char[1]) and %00011111) shl 18)
  183. or ((ord(UTF8Char[2]) and %00111111) shl 12)
  184. or ((ord(UTF8Char[3]) and %00111111) shl 6)
  185. or (ord(UTF8Char[4]) and %00111111);
  186. else
  187. Result := $FFFFFFFF;
  188. end
  189. end;
  190. function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
  191. begin
  192. case Length(UTF8Char) of
  193. 1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
  194. Result := ord(UTF8Char[1]);
  195. 2:
  196. Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
  197. or (ord(UTF8Char[2]) and %00111111);
  198. else
  199. Result := $FFFF;
  200. end;
  201. end;
  202. function UTF8ToUnicode(const Src:TUTF8String):TString;
  203. var
  204. lp, vp:Integer;
  205. c:TUTF8Char;
  206. begin
  207. lp := 1;
  208. vp := 0;
  209. SetLength(Result, Length(Src));
  210. while lp <= Length(Src) do
  211. begin
  212. vp += 1;
  213. c := LCharOf(Src, lp);
  214. Result[vp] := WideChar(UTF8ToUCS16(c));
  215. lp += Length(c);
  216. end;
  217. SetLength(Result, vp);
  218. end;
  219. function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
  220. var
  221. Len: Integer;
  222. begin
  223. Len:=VLength(UTF8Str, drLTR);
  224. SetLength(Result,Len*2);
  225. if Len=0 then exit;
  226. UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
  227. end;
  228. function UTF8ToDoubleByte(UTF8Str: PChar; Len:Cardinal; DBStr: PByte):Cardinal;
  229. var
  230. SrcPos: PChar;
  231. CharLen: LongInt;
  232. DestPos: PByte;
  233. u: Cardinal;
  234. begin
  235. SrcPos:=UTF8Str;
  236. DestPos:=DBStr;
  237. Result:=0;
  238. while Len>0 do begin
  239. u:=UTF8ToUCS32(SrcPos);
  240. DestPos^:=byte((u shr 8) and $ff);
  241. inc(DestPos);
  242. DestPos^:=byte(u and $ff);
  243. inc(DestPos);
  244. inc(SrcPos,CharLen);
  245. dec(Len,CharLen);
  246. inc(Result);
  247. end;
  248. end;
  249. {****************************Logical aspects***********************************}
  250. function LLength(const UTF8Str:TUTF8String):Cardinal;
  251. begin
  252. Result := Length(UTF8Str);
  253. end;
  254. function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
  255. var
  256. v2l:TVisualToLogical;
  257. i:integer;
  258. begin
  259. v2l := VisualToLogical(UTF8Str, pDir);
  260. if vp <= v2l[0]
  261. then
  262. Result := v2l[vp]
  263. else
  264. Result := Length(UTF8Str) + 1;
  265. end;
  266. function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
  267. begin
  268. if lp > Length(UTF8String)
  269. then
  270. Exit('');
  271. while(lp > 0) and ((Ord(UTF8String[lp]) and $F0) in [$80..$B0]) do
  272. begin
  273. Dec(lp);
  274. end;
  275. if lp = 0
  276. then
  277. Exit('');
  278. Move(UTF8String[lp], Result[1], SizeOf(TUTF8Char) - 1);
  279. SetLength(Result, ComputeCharLength(@Result[1]));
  280. end;
  281. {****************************Visual aspects************************************}
  282. function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
  283. begin
  284. Result := FreeBIDI.VLength(UTF8ToUnicode(Src), pDir);
  285. end;
  286. function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
  287. var
  288. v2l:TVisualToLogical;
  289. vp:Integer;
  290. begin
  291. v2l := VisualToLogical(UTF8Str, pDir);
  292. for vp := 1 to v2l[0] do
  293. if lp = v2l[vp]
  294. then
  295. begin
  296. Exit(vp);
  297. end;
  298. Result := v2l[0];
  299. end;
  300. function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
  301. begin
  302. end;
  303. function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
  304. var
  305. CharLen: LongInt;
  306. begin
  307. Result:=LCharOf(UTF8Str,LPos(UTF8Str, vp, dir));
  308. end;
  309. {****************************Helper routines***********************************}
  310. function DirectionOf(Character:TUTF8Char):TDirection;
  311. begin
  312. case Character[1] of
  313. #9,#32,
  314. '/',
  315. '{','}',
  316. '[',']',
  317. '(',')':
  318. Result := drNONE;
  319. #$D8,#$D9:
  320. Result := drRTL;
  321. else
  322. Result := drLTR;
  323. end;
  324. end;
  325. function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
  326. var
  327. c:TUTF8Char;
  328. lDir,rDir:TDirection;
  329. p:Integer;
  330. begin
  331. if(lp <= 0)
  332. then
  333. lp := 1;
  334. {Seek for proper character direction}
  335. c := LCharOf(UTF8String, lp);
  336. lDir := DirectionOf(c);
  337. {Seek for left character direction if it is neutral}
  338. p := lp;
  339. while(p > 1) and (lDir = drNONE)do
  340. begin
  341. c := LCharOf(UTF8String, p - 1);
  342. lDir := DirectionOf(c);
  343. p := p - Length(c);
  344. end;
  345. {Seek for right character direction if it is neutral}
  346. p := lp;
  347. repeat
  348. c := LCharOf(UTF8String, p);
  349. rDir := DirectionOf(c);
  350. p := p + Length(c);
  351. until(p > Length(UTF8String)) or (rDir <> drNONE);
  352. if(lDir = rDir)
  353. then
  354. Result := rDir
  355. else
  356. Result := pDir;
  357. end;
  358. function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
  359. procedure Insert(value:Byte; var v2l:TVisualToLogical; InsPos:Byte);
  360. var
  361. l:Byte;
  362. begin
  363. if v2l[0] < 255
  364. then
  365. Inc(InsPos);
  366. if InsPos > v2l[0]
  367. then
  368. InsPos := v2l[0];
  369. for l := v2l[0] downto InsPos do
  370. v2l[l] := v2l[l-1];
  371. v2l[InsPos] := Value;
  372. end;
  373. var
  374. lp, vp : Integer;
  375. cDir,lDir:TDirection;
  376. Character:TUTF8Char;
  377. i:Integer;
  378. begin
  379. Result[0] := 0;
  380. lp := 1;
  381. vp := 1;
  382. lDir := drNONE;
  383. while lp <= Length(UTF8String) do
  384. begin
  385. Character := LCharOf(UTF8String, lp);
  386. cDir := DirectionOf(UTF8String, lp, pDir);
  387. Inc(Result[0]);
  388. case cDir of
  389. drRTL:
  390. begin
  391. lDir := drRTL;
  392. end;
  393. drLTR:
  394. begin
  395. lDir := drLTR;
  396. vp := Result[0];
  397. end;
  398. else
  399. vp := Result[0];
  400. end;
  401. Insert(lp, Result, vp);
  402. Inc(lp, Length(Character));
  403. end;
  404. end;
  405. function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
  406. var
  407. temp:TString;
  408. c:TCharacter;
  409. begin
  410. temp := UTF8ToUnicode(Dest);
  411. c := WideChar(UTF8ToUCS16(Src));
  412. Result := FreeBIDI.InsertChar(c, temp, vp, pDir);
  413. Dest := UnicodeToUTF8(temp);
  414. end;
  415. procedure VInsert(const Src:TUTF8String;var Dest:TUTF8String; vp:Integer; pDir:TDirection);
  416. function VStr(const Src:TUTF8String; pDir:TDirection):TUTF8String;
  417. var
  418. v2lSrc:TVisualToLogical;
  419. i:Integer;
  420. begin
  421. v2lSrc := VisualToLogical(Src,pDir);
  422. Result := '';
  423. for i := 1 to v2lSrc[0] do
  424. Result := Result + LCharOf(Src,v2lSrc[i]);
  425. end;
  426. var
  427. vSrc,vDest:TUTF8String;
  428. begin
  429. vSrc := VStr(Src,pDir);
  430. vDest := VStr(Dest,pDir);
  431. Insert(vSrc, vDest, vp);
  432. Dest := VStr(vDest, pDir);
  433. end;
  434. procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
  435. var
  436. temp:TString;
  437. begin
  438. temp := UTF8ToUnicode(str);
  439. FreeBIDI.VDelete(temp, vp, len, pDir);
  440. str := UnicodeToUTF8(temp);
  441. end;
  442. end.