123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472 |
- {
- Author Mazen NEIFER
- Licence LGPL
- }
- unit UTF8BIDI;
- {$mode objfpc}{$H+}
- interface
- uses
- FreeBIDI;
- type
- TUCS32Char = Cardinal;
- TUCS16Char = Word;
- TUTF8Char = String[4];
- TUTF8String = UTF8String;
- {****************************Conversion routines*******************************}
- {Converts an UCS 16/32 bits charcater to UTF8 character}
- function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
- {Converts a wide char UCS 16 bits chcarcter to UTF8 character}
- function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
- {Converts a wide char UCS 16 bits string to UTF8 character}
- function UnicodeToUTF8(const Src:TString):TUTF8String;
- {Converts an UTF8 character to UCS 32 bits character}
- function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
- {Converts an UTF8 character to UCS 16 bits character}
- function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
- {Converts an UTF8 string to UCS 16 bits string}
- function UTF8ToUnicode(const Src:TUTF8String):TString;
- {Converts an UTF8 string to a double byte string}
- function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
- function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
- {****************************Logical aspects***********************************}
- {Returns the number of logical characters}
- function LLength(const UTF8Str:TUTF8String):Cardinal;
- {Converts visual position to logical position}
- function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
- {Returns character at a given logical position according to paragraph direction}
- function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
- {****************************Visual aspects************************************}
- {Returns the number of visual characters}
- function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
- {Converts a logical position to a visual position}
- function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
- {Returns character at a given visual position according to paragraph direction}
- function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
- {Inserts a string into an other paying attention of RTL/LTR direction}
- procedure VInsert(const Src:TUTF8String; var Dest:TUTF8String; vp:Integer; pDir:TDirection);
- {Deletes a string into an other paying attention of RTL/LTR direction}
- procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
- {****************************Helper routines***********************************}
- {Returns direction of a character}
- function DirectionOf(Character:TUTF8Char):TDirection;
- {Returns contextual direction of caracter in a string}
- function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
- {Inserts a char as if it was typed using keyboard in the most user friendly way.
- Returns the new cursor position after insersion depending on the new visual text}
- function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
- {Returns a table mapping each visual position to its logical position in an UTF8*
- string}
- function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
- implementation
- function DumpStr(const s:TUTF8String):String;
- var
- i:Integer;
- begin
- Result := '';
- for i:= 1 to Length(s) do
- case s[i] of
- #0..#127:
- Result := Result + s[i];
- else
- Result := Result + '$' + HexStr(Ord(s[i]),2);
- end;
- end;
- function ComputeCharLength(p:PChar):Cardinal;
- begin
- if ord(p^)<%11000000
- then
- {regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
- Result:=1
- else if ((ord(p^) and %11100000) = %11000000)
- then
- if (ord(p[1]) and %11000000) = %10000000 then
- Result:=2
- else
- Result:=1
- else if ((ord(p^) and %11110000) = %11100000)
- then
- if ((ord(p[1]) and %11000000) = %10000000)
- and ((ord(p[2]) and %11000000) = %10000000)
- then
- Result:=3
- else
- Result:=1
- else if ((ord(p^) and %11111000) = %11110000)
- then
- if ((ord(p[1]) and %11000000) = %10000000)
- and ((ord(p[2]) and %11000000) = %10000000)
- and ((ord(p[3]) and %11000000) = %10000000)
- then
- Result:=4
- else
- Result:=1
- else
- Result:=1
- end;
- {****************************Conversion routines*******************************}
- function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
- begin
- case aChar of
- 0..$7f:
- begin
- Result[1]:=char(aChar);
- SetLength(UnicodeToUTF8,1);
- end;
- $80..$7ff:
- begin
- Result[1]:=char($c0 or (aChar shr 6));
- Result[2]:=char($80 or (aChar and $3f));
- SetLength(UnicodeToUTF8,2);
- end;
- $800..$ffff:
- begin
- SetLength(Result,3);
- Result[1]:=char($e0 or (aChar shr 12));
- Result[2]:=char($80 or ((aChar shr 6) and $3f));
- Result[3]:=char($80 or (aChar and $3f));
- end;
- $10000..$1fffff:
- begin
- SetLength(UnicodeToUTF8,4);
- Result[1]:=char($f0 or (aChar shr 18));
- Result[2]:=char($80 or ((aChar shr 12) and $3f));
- Result[3]:=char($80 or ((aChar shr 6) and $3f));
- Result[4]:=char($80 or (aChar and $3f));
- end;
- else
- SetLength(UnicodeToUTF8, 0);
- end;
- end;
- function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
- var
- c:TUCS16Char absolute aChar;
- begin
- case c of
- 0..$7f:
- begin
- Result[1]:=char(c);
- SetLength(UnicodeToUTF8,1);
- end;
- $80..$7ff:
- begin
- Result[1]:=char($c0 or (c shr 6));
- Result[2]:=char($80 or (c and $3f));
- SetLength(UnicodeToUTF8,2);
- end;
- else
- SetLength(UnicodeToUTF8, 0);
- end;
- end;
- function UnicodeToUTF8(const Src:TString):TUTF8String;
- var
- vp:Integer;
- begin
- vp := 1;
- Result := '';
- for vp := 1 to Length(Src) do
- Result += UnicodeToUTF8(Src[vp]);
- end;
- function UTF8ToUCS32(const UTF8Char:TUTF8Char):TUCS32Char;
- begin
- case ComputeCharLength(@UTF8Char[1]) of
- 1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
- Result := ord(UTF8Char[1]);
- 2:
- Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
- or (ord(UTF8Char[2]) and %00111111);
- 3:
- Result := ((ord(UTF8Char[1]) and %00011111) shl 12)
- or ((ord(UTF8Char[1]) and %00111111) shl 6)
- or (ord(UTF8Char[2]) and %00111111);
- 4:
- Result := ((ord(UTF8Char[1]) and %00011111) shl 18)
- or ((ord(UTF8Char[2]) and %00111111) shl 12)
- or ((ord(UTF8Char[3]) and %00111111) shl 6)
- or (ord(UTF8Char[4]) and %00111111);
- else
- Result := $FFFFFFFF;
- end
- end;
- function UTF8ToUCS16(const UTF8Char:TUTF8Char):TUCS16Char;
- begin
- case Length(UTF8Char) of
- 1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
- Result := ord(UTF8Char[1]);
- 2:
- Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
- or (ord(UTF8Char[2]) and %00111111);
- else
- Result := $FFFF;
- end;
- end;
- function UTF8ToUnicode(const Src:TUTF8String):TString;
- var
- lp, vp:Integer;
- c:TUTF8Char;
- begin
- lp := 1;
- vp := 0;
- SetLength(Result, Length(Src));
- while lp <= Length(Src) do
- begin
- vp += 1;
- c := LCharOf(Src, lp);
- Result[vp] := WideChar(UTF8ToUCS16(c));
- lp += Length(c);
- end;
- SetLength(Result, vp);
- end;
- function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
- var
- Len: Integer;
- begin
- Len:=VLength(UTF8Str, drLTR);
- SetLength(Result,Len*2);
- if Len=0 then exit;
- UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
- end;
- function UTF8ToDoubleByte(UTF8Str: PChar; Len:Cardinal; DBStr: PByte):Cardinal;
- var
- SrcPos: PChar;
- CharLen: LongInt;
- DestPos: PByte;
- u: Cardinal;
- begin
- SrcPos:=UTF8Str;
- DestPos:=DBStr;
- Result:=0;
- while Len>0 do begin
- u:=UTF8ToUCS32(SrcPos);
- DestPos^:=byte((u shr 8) and $ff);
- inc(DestPos);
- DestPos^:=byte(u and $ff);
- inc(DestPos);
- inc(SrcPos,CharLen);
- dec(Len,CharLen);
- inc(Result);
- end;
- end;
- {****************************Logical aspects***********************************}
- function LLength(const UTF8Str:TUTF8String):Cardinal;
- begin
- Result := Length(UTF8Str);
- end;
- function LPos(const UTF8Str:TUTF8String; vp:Integer; pDir:TDirection):Cardinal;
- var
- v2l:TVisualToLogical;
- i:integer;
- begin
- v2l := VisualToLogical(UTF8Str, pDir);
- if vp <= v2l[0]
- then
- Result := v2l[vp]
- else
- Result := Length(UTF8Str) + 1;
- end;
- function LCharOf(UTF8String:TUTF8String; lp:Integer):TUTF8Char;
- begin
- if lp > Length(UTF8String)
- then
- Exit('');
- while(lp > 0) and ((Ord(UTF8String[lp]) and $F0) in [$80..$B0]) do
- begin
- Dec(lp);
- end;
- if lp = 0
- then
- Exit('');
- Move(UTF8String[lp], Result[1], SizeOf(TUTF8Char) - 1);
- SetLength(Result, ComputeCharLength(@Result[1]));
- end;
- {****************************Visual aspects************************************}
- function VLength(const Src:TUTF8String; pDir:TDirection):Cardinal;
- begin
- Result := FreeBIDI.VLength(UTF8ToUnicode(Src), pDir);
- end;
- function VPos(const UTF8Str:TUTF8String; lp:Integer; pDir, cDir:TDirection):Cardinal;
- var
- v2l:TVisualToLogical;
- vp:Integer;
- begin
- v2l := VisualToLogical(UTF8Str, pDir);
- for vp := 1 to v2l[0] do
- if lp = v2l[vp]
- then
- begin
- Exit(vp);
- end;
- Result := v2l[0];
- end;
- function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
- begin
- end;
- function VCharOf(UTF8Str:TUTF8String; vp:Integer; dir:TDirection):TUTF8Char;
- var
- CharLen: LongInt;
- begin
- Result:=LCharOf(UTF8Str,LPos(UTF8Str, vp, dir));
- end;
- {****************************Helper routines***********************************}
- function DirectionOf(Character:TUTF8Char):TDirection;
- begin
- case Character[1] of
- #9,#32,
- '/',
- '{','}',
- '[',']',
- '(',')':
- Result := drNONE;
- #$D8,#$D9:
- Result := drRTL;
- else
- Result := drLTR;
- end;
- end;
- function DirectionOf(UTF8String:TUTF8String; lp:Integer; pDir:TDirection):TDirection;
- var
- c:TUTF8Char;
- lDir,rDir:TDirection;
- p:Integer;
- begin
- if(lp <= 0)
- then
- lp := 1;
- {Seek for proper character direction}
- c := LCharOf(UTF8String, lp);
- lDir := DirectionOf(c);
- {Seek for left character direction if it is neutral}
- p := lp;
- while(p > 1) and (lDir = drNONE)do
- begin
- c := LCharOf(UTF8String, p - 1);
- lDir := DirectionOf(c);
- p := p - Length(c);
- end;
- {Seek for right character direction if it is neutral}
- p := lp;
- repeat
- c := LCharOf(UTF8String, p);
- rDir := DirectionOf(c);
- p := p + Length(c);
- until(p > Length(UTF8String)) or (rDir <> drNONE);
- if(lDir = rDir)
- then
- Result := rDir
- else
- Result := pDir;
- end;
- function VisualToLogical(const UTF8String:TUTF8String; pDir:TDirection):TVisualToLogical;
- procedure Insert(value:Byte; var v2l:TVisualToLogical; InsPos:Byte);
- var
- l:Byte;
- begin
- if v2l[0] < 255
- then
- Inc(InsPos);
- if InsPos > v2l[0]
- then
- InsPos := v2l[0];
- for l := v2l[0] downto InsPos do
- v2l[l] := v2l[l-1];
- v2l[InsPos] := Value;
- end;
- var
- lp, vp : Integer;
- cDir,lDir:TDirection;
- Character:TUTF8Char;
- i:Integer;
- begin
- Result[0] := 0;
- lp := 1;
- vp := 1;
- lDir := drNONE;
- while lp <= Length(UTF8String) do
- begin
- Character := LCharOf(UTF8String, lp);
- cDir := DirectionOf(UTF8String, lp, pDir);
- Inc(Result[0]);
- case cDir of
- drRTL:
- begin
- lDir := drRTL;
- end;
- drLTR:
- begin
- lDir := drLTR;
- vp := Result[0];
- end;
- else
- vp := Result[0];
- end;
- Insert(lp, Result, vp);
- Inc(lp, Length(Character));
- end;
- end;
- function InsertChar(Src:TUTF8Char; var Dest:TUTF8String; vp:Integer; pDir:TDirection):Integer;
- var
- temp:TString;
- c:TCharacter;
- begin
- temp := UTF8ToUnicode(Dest);
- c := WideChar(UTF8ToUCS16(Src));
- Result := FreeBIDI.InsertChar(c, temp, vp, pDir);
- Dest := UnicodeToUTF8(temp);
- end;
- procedure VInsert(const Src:TUTF8String;var Dest:TUTF8String; vp:Integer; pDir:TDirection);
- function VStr(const Src:TUTF8String; pDir:TDirection):TUTF8String;
- var
- v2lSrc:TVisualToLogical;
- i:Integer;
- begin
- v2lSrc := VisualToLogical(Src,pDir);
- Result := '';
- for i := 1 to v2lSrc[0] do
- Result := Result + LCharOf(Src,v2lSrc[i]);
- end;
- var
- vSrc,vDest:TUTF8String;
- begin
- vSrc := VStr(Src,pDir);
- vDest := VStr(Dest,pDir);
- Insert(vSrc, vDest, vp);
- Dest := VStr(vDest, pDir);
- end;
- procedure VDelete(var str:TUTF8String; vp, len:Integer; pDir:TDirection);
- var
- temp:TString;
- begin
- temp := UTF8ToUnicode(str);
- FreeBIDI.VDelete(temp, vp, len, pDir);
- str := UnicodeToUTF8(temp);
- end;
- end.
|