utf8bidi.pp 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. {
  2. Author Mazen NEIFER
  3. Licence LGPL
  4. }
  5. unit UTF8BIDI;
  6. {$mode objfpc}{$H+}
  7. interface
  8. type
  9. TUCS32Char = Cardinal;
  10. TUCS16Char = Word;
  11. TUTF8Char = String[4];
  12. TUTF8String = UTF8String;
  13. TDirection=(
  14. drNONE,
  15. drRTL,
  16. drLTR
  17. );
  18. {****************************Conversion routines*******************************}
  19. {Converts an UCS 16/32 bits charcater to UTF8 character}
  20. function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
  21. {Converts a wide char UCS 16 bits chcarcter to UTF8 character}
  22. function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
  23. {Converts an UTF8 character to UCS 32 bits character}
  24. function UTF8ToUnicode(const UTF8Char:TUTF8Char):TUCS32Char;
  25. {Converts an UTF8 string to a double byte string}
  26. function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
  27. function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
  28. {****************************Logical aspects***********************************}
  29. {Returns the number of logical characters}
  30. function LLength(const UTF8Str:TUTF8String):Cardinal;
  31. {Converts visual position to logical position}
  32. function LPos(const UTF8Str:TUTF8String; vp:Cardinal; pDir:TDirection):Cardinal;
  33. {Returns character at a given logical position according to paragraph direction}
  34. function LCharOf(UTF8Str:TUTF8String; lp:Cardinal):TUTF8Char;
  35. {****************************Visual aspects************************************}
  36. {Returns the number of visual characters}
  37. function VLength(const UTF8Str:TUTF8String):Cardinal;
  38. function VLength(p: PChar; Count:Cardinal):Cardinal;
  39. {Converts a logical position to a visual position}
  40. function VPos(const UTF8Str:TUTF8String; lp:Cardinal; pDir, cDir:TDirection):Cardinal;
  41. function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
  42. {Returns character at a given visual position according to paragraph direction}
  43. function VCharOf(UTF8Str:TUTF8String; vp:Cardinal; dir:TDirection):TUTF8Char;
  44. implementation
  45. function ComputeCharLength(p:PChar):Cardinal;
  46. begin
  47. if ord(p^)<%11000000
  48. then
  49. {regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
  50. Result:=1
  51. else if ((ord(p^) and %11100000) = %11000000)
  52. then
  53. if (ord(p[1]) and %11000000) = %10000000 then
  54. Result:=2
  55. else
  56. Result:=1
  57. else if ((ord(p^) and %11110000) = %11100000)
  58. then
  59. if ((ord(p[1]) and %11000000) = %10000000)
  60. and ((ord(p[2]) and %11000000) = %10000000)
  61. then
  62. Result:=3
  63. else
  64. Result:=1
  65. else if ((ord(p^) and %11111000) = %11110000)
  66. then
  67. if ((ord(p[1]) and %11000000) = %10000000)
  68. and ((ord(p[2]) and %11000000) = %10000000)
  69. and ((ord(p[3]) and %11000000) = %10000000)
  70. then
  71. Result:=4
  72. else
  73. Result:=1
  74. else
  75. Result:=1
  76. end;
  77. {****************************Conversion routines*******************************}
  78. function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
  79. begin
  80. case aChar of
  81. 0..$7f:
  82. begin
  83. Result[1]:=char(aChar);
  84. SetLength(UnicodeToUTF8,1);
  85. end;
  86. $80..$7ff:
  87. begin
  88. Result[1]:=char($c0 or (aChar shr 6));
  89. Result[2]:=char($80 or (aChar and $3f));
  90. SetLength(UnicodeToUTF8,2);
  91. end;
  92. $800..$ffff:
  93. begin
  94. SetLength(Result,3);
  95. Result[1]:=char($e0 or (aChar shr 12));
  96. Result[2]:=char($80 or ((aChar shr 6) and $3f));
  97. Result[3]:=char($80 or (aChar and $3f));
  98. end;
  99. $10000..$1fffff:
  100. begin
  101. SetLength(UnicodeToUTF8,4);
  102. Result[1]:=char($f0 or (aChar shr 18));
  103. Result[2]:=char($80 or ((aChar shr 12) and $3f));
  104. Result[3]:=char($80 or ((aChar shr 6) and $3f));
  105. Result[4]:=char($80 or (aChar and $3f));
  106. end;
  107. else
  108. SetLength(UnicodeToUTF8, 0);
  109. end;
  110. end;
  111. function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
  112. begin
  113. Result := UnicodeToUTF8(Word(aChar));
  114. end;
  115. function UTF8ToUnicode(const UTF8Char:TUTF8Char):TUCS32Char;
  116. begin
  117. case ComputeCharLength(@UTF8Char[1]) of
  118. 1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
  119. Result := ord(UTF8Char[1]);
  120. 2:
  121. Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
  122. or (ord(UTF8Char[2]) and %00111111);
  123. 3:
  124. Result := ((ord(UTF8Char[1]) and %00011111) shl 12)
  125. or ((ord(UTF8Char[1]) and %00111111) shl 6)
  126. or (ord(UTF8Char[2]) and %00111111);
  127. 4:
  128. Result := ((ord(UTF8Char[1]) and %00011111) shl 18)
  129. or ((ord(UTF8Char[2]) and %00111111) shl 12)
  130. or ((ord(UTF8Char[3]) and %00111111) shl 6)
  131. or (ord(UTF8Char[4]) and %00111111);
  132. else
  133. Result := $FFFFFFFF;
  134. end
  135. end;
  136. function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
  137. var
  138. Len: Integer;
  139. begin
  140. Len:=VLength(UTF8Str);
  141. SetLength(Result,Len*2);
  142. if Len=0 then exit;
  143. UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
  144. end;
  145. function UTF8ToDoubleByte(UTF8Str: PChar; Len:Cardinal; DBStr: PByte):Cardinal;
  146. var
  147. SrcPos: PChar;
  148. CharLen: LongInt;
  149. DestPos: PByte;
  150. u: Cardinal;
  151. begin
  152. SrcPos:=UTF8Str;
  153. DestPos:=DBStr;
  154. Result:=0;
  155. while Len>0 do begin
  156. u:=UTF8ToUnicode(SrcPos);
  157. DestPos^:=byte((u shr 8) and $ff);
  158. inc(DestPos);
  159. DestPos^:=byte(u and $ff);
  160. inc(DestPos);
  161. inc(SrcPos,CharLen);
  162. dec(Len,CharLen);
  163. inc(Result);
  164. end;
  165. end;
  166. {****************************Logical aspects***********************************}
  167. function LLength(const UTF8Str:TUTF8String):Cardinal;
  168. begin
  169. Result := Length(UTF8Str);
  170. end;
  171. function LPos(const UTF8Str:TUTF8String; vp:Cardinal; pDir:TDirection):Cardinal;
  172. var
  173. {At beginning of the line we don't know which direction, thus the first
  174. character usually decides of paragrph direction}
  175. LeftCursorPos, RightCursorPos:Integer;
  176. uLen:Integer;
  177. begin
  178. uLen := Length(UTF8Str);
  179. LeftCursorPos := 1;
  180. RightCursorPos := 1;
  181. Result := 1;
  182. if(uLen > 0) then
  183. repeat
  184. case UTF8Str[Result] of
  185. #32,'{','}','/'://Does not change direction, this is a neutral character;
  186. begin
  187. if(pDir = drLTR) then
  188. Inc(RightCursorPos);
  189. end;
  190. #$d8,#$d9://Arabic
  191. begin
  192. pDir := drRTL;
  193. Inc(Result);//Consume control character
  194. end;
  195. else //Latin
  196. begin
  197. pDir := drLTR;
  198. RightCursorPos := LeftCursorPos + 1;
  199. end;
  200. end;
  201. Inc(LeftCursorPos);
  202. Inc(Result);
  203. until(Result > uLen) or
  204. ((pDir = drLTR) and (LeftCursorPos > vp)) or
  205. ((pDir = drRTL) and (RightCursorPos > vp));
  206. //WriteLn('uLen=',uLen,' Result=',Result,' CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  207. if(Result > uLen)
  208. then begin
  209. if(vp > LeftCursorPos) then begin
  210. Inc(Result, vp - LeftCursorPos);
  211. LeftCursorPos := vp;
  212. end;
  213. Inc(LeftCursorPos);
  214. if(vp > RightCursorPos) then
  215. if(pDir = drLTR) then
  216. RightCursorPos := vp;
  217. end;
  218. //WriteLn('CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  219. Result := Result;
  220. end;
  221. function LCharOf(UTF8Str:TUTF8String; lp:Cardinal):TUTF8Char;
  222. begin
  223. while(lp > 0) and (UTF8Str[lp] > #128) do
  224. Dec(lp);
  225. if lp = 0
  226. then
  227. Exit('');
  228. Move(Result, UTF8Str[lp], SizeOf(Result));
  229. SetLength(Result, ComputeCharLength(@Result[1]));
  230. end;
  231. {****************************Visual aspects************************************}
  232. function VLength(const UTF8Str:TUTF8String):Cardinal;
  233. begin
  234. Result := VLength(PChar(UTF8Str),LLength(UTF8Str));
  235. end;
  236. function VLength(p:PChar; Count:Cardinal):Cardinal;
  237. var
  238. CharLen: LongInt;
  239. begin
  240. VLength:=0;
  241. while (Count>0) do begin
  242. inc(Result);
  243. CharLen:=ComputeCharLength(p);
  244. inc(p,CharLen);
  245. dec(Count,CharLen);
  246. end;
  247. end;
  248. function VPos(const UTF8Str:TUTF8String; lp:Cardinal; pDir, cDir:TDirection):Cardinal;
  249. var
  250. {At beginning of the line we don't know which direction, thus the first
  251. character usually decides of paragrph direction}
  252. LeftCursorPos, RightCursorPos:Integer;
  253. uLen:Integer;
  254. begin
  255. uLen := Length(UTF8Str);
  256. LeftCursorPos := 1;
  257. RightCursorPos := 1;
  258. Result := 1;
  259. if(uLen > 0) then
  260. repeat
  261. case UTF8Str[Result] of
  262. #32,'{','}','/'://Does not change direction, this is a neutral character;
  263. begin
  264. if(pDir = drLTR) then
  265. Inc(RightCursorPos);
  266. end;
  267. #$d8,#$d9://Arabic
  268. begin
  269. pDir := drRTL;
  270. Inc(Result);//Consume control character
  271. end;
  272. else //Latin
  273. begin
  274. pDir := drLTR;
  275. RightCursorPos := LeftCursorPos + 1;
  276. end;
  277. end;
  278. Inc(LeftCursorPos);
  279. Inc(Result);
  280. until(Result > uLen) or
  281. ((pDir = drLTR) and (LeftCursorPos > lp)) or
  282. ((pDir = drRTL) and (RightCursorPos > lp));
  283. //WriteLn('uLen=',uLen,' Result=',Result,' CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  284. if(Result > uLen)
  285. then begin
  286. if(lp > LeftCursorPos) then begin
  287. Inc(Result, lp - LeftCursorPos);
  288. LeftCursorPos := lp;
  289. end;
  290. Inc(LeftCursorPos);
  291. if(lp > RightCursorPos) then
  292. if(pDir = drLTR) then
  293. RightCursorPos := lp;
  294. end;
  295. //WriteLn('CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  296. Result := Result;
  297. { case dir of
  298. #32:
  299. CursorPos := LeftCursorPos;
  300. #$d8,#$d9:
  301. CursorPos := RightCursorPos;
  302. else
  303. CursorPos := LeftCursorPos;
  304. end;}
  305. //WriteLn('Result=',Result,' New CursorPos=',CursorPos);
  306. end;
  307. function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
  308. begin
  309. end;
  310. function VCharOf(UTF8Str:TUTF8String; vp:Cardinal; dir:TDirection):TUTF8Char;
  311. var
  312. CharLen: LongInt;
  313. begin
  314. Result:=LCharOf(UTF8Str,LPos(UTF8Str, vp, dir));
  315. end;
  316. end.