pascal
/
freepascal.compiler
mirror of https://gitlab.com/freepascal.org/fpc/source.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
							{
Author Mazen NEIFER
Licence LGPL
}
unit UTF8BIDI;

{$mode objfpc}{$H+}

interface

type
  TUCS32Char = Cardinal;
  TUCS16Char = Word;
  TUTF8Char = String[4];
  TUTF8String = UTF8String;
  TDirection=(
    drNONE,
    drRTL,
    drLTR
  );

{****************************Conversion routines*******************************}
{Converts an UCS 16/32 bits charcater to UTF8 character}
function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
{Converts a wide char UCS 16 bits chcarcter to UTF8 character}
function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
{Converts an UTF8 character to UCS 32 bits character}
function UTF8ToUnicode(const UTF8Char:TUTF8Char):TUCS32Char;
{Converts an UTF8 string to a double byte string}
function UTF8ToDoubleByteString(const UTF8Str:TUTF8String):String;
function UTF8ToDoubleByte(UTF8Str:PChar; Len:Cardinal; DBStr:PByte):Cardinal;
{****************************Logical aspects***********************************}
{Returns the number of logical characters}
function LLength(const UTF8Str:TUTF8String):Cardinal;
{Converts visual position to logical position}
function LPos(const UTF8Str:TUTF8String; vp:Cardinal; pDir:TDirection):Cardinal;
{Returns character at a given logical position according to paragraph direction}
function LCharOf(UTF8Str:TUTF8String; lp:Cardinal):TUTF8Char;
{****************************Visual aspects************************************}
{Returns the number of visual characters}
function VLength(const UTF8Str:TUTF8String):Cardinal;
function VLength(p: PChar; Count:Cardinal):Cardinal;
{Converts a logical position to a visual position}
function VPos(const UTF8Str:TUTF8String; lp:Cardinal; pDir, cDir:TDirection):Cardinal;
function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
{Returns character at a given visual position according to paragraph direction}
function VCharOf(UTF8Str:TUTF8String; vp:Cardinal; dir:TDirection):TUTF8Char;

implementation

function ComputeCharLength(p:PChar):Cardinal;
begin
  if ord(p^)<%11000000
  then
{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
    Result:=1
  else if ((ord(p^) and %11100000) = %11000000)
  then
    if (ord(p[1]) and %11000000) = %10000000 then
      Result:=2
    else
      Result:=1
  else if ((ord(p^) and %11110000) = %11100000)
  then
    if ((ord(p[1]) and %11000000) = %10000000)
      and ((ord(p[2]) and %11000000) = %10000000)
    then
      Result:=3
    else
        Result:=1
  else if ((ord(p^) and %11111000) = %11110000)
  then
    if ((ord(p[1]) and %11000000) = %10000000)
    and ((ord(p[2]) and %11000000) = %10000000)
    and ((ord(p[3]) and %11000000) = %10000000)
    then
      Result:=4
    else
      Result:=1
  else
    Result:=1
end;
{****************************Conversion routines*******************************}
function UnicodeToUTF8(aChar:TUCS32Char):TUTF8Char;
begin
  case aChar of
    0..$7f:
      begin
        Result[1]:=char(aChar);
        SetLength(UnicodeToUTF8,1);
      end;
    $80..$7ff:
      begin
        Result[1]:=char($c0 or (aChar shr 6));
        Result[2]:=char($80 or (aChar and $3f));
        SetLength(UnicodeToUTF8,2);
      end;
    $800..$ffff:
      begin
        SetLength(Result,3);
        Result[1]:=char($e0 or (aChar shr 12));
        Result[2]:=char($80 or ((aChar shr 6) and $3f));
        Result[3]:=char($80 or (aChar and $3f));
      end;
    $10000..$1fffff:
      begin
        SetLength(UnicodeToUTF8,4);
        Result[1]:=char($f0 or (aChar shr 18));
        Result[2]:=char($80 or ((aChar shr 12) and $3f));
        Result[3]:=char($80 or ((aChar shr 6) and $3f));
        Result[4]:=char($80 or (aChar and $3f));
      end;
  else
    SetLength(UnicodeToUTF8, 0);
  end;
end;

function UnicodeToUTF8(aChar:WideChar):TUTF8Char;
begin
  Result := UnicodeToUTF8(Word(aChar));
end;

function UTF8ToUnicode(const UTF8Char:TUTF8Char):TUCS32Char;
begin
  case ComputeCharLength(@UTF8Char[1]) of
    1:{regular single byte character (#0 is a normal char, this is UTF8Charascal ;)}
      Result := ord(UTF8Char[1]);
    2:
      Result := ((ord(UTF8Char[1]) and %00011111) shl 6)
                or (ord(UTF8Char[2]) and %00111111);
    3:
      Result := ((ord(UTF8Char[1]) and %00011111) shl 12)
                or ((ord(UTF8Char[1]) and %00111111) shl 6)
                or (ord(UTF8Char[2]) and %00111111);
    4:
      Result := ((ord(UTF8Char[1]) and %00011111) shl 18)
                or ((ord(UTF8Char[2]) and %00111111) shl 12)
                or ((ord(UTF8Char[3]) and %00111111) shl 6)
                or (ord(UTF8Char[4]) and %00111111);
  else
    Result := $FFFFFFFF;
  end
end;

function UTF8ToDoubleByteString(const UTF8Str: TUTF8String): string;
var
  Len: Integer;
begin
  Len:=VLength(UTF8Str);
  SetLength(Result,Len*2);
  if Len=0 then exit;
  UTF8ToDoubleByte(PChar(UTF8Str),length(UTF8Str),PByte(Result));
end;

function UTF8ToDoubleByte(UTF8Str: PChar; Len:Cardinal; DBStr: PByte):Cardinal;
var
  SrcPos: PChar;
  CharLen: LongInt;
  DestPos: PByte;
  u: Cardinal;
begin
  SrcPos:=UTF8Str;
  DestPos:=DBStr;
  Result:=0;
  while Len>0 do begin
    u:=UTF8ToUnicode(SrcPos);
    DestPos^:=byte((u shr 8) and $ff);
    inc(DestPos);
    DestPos^:=byte(u and $ff);
    inc(DestPos);
    inc(SrcPos,CharLen);
    dec(Len,CharLen);
    inc(Result);
  end;
end;
{****************************Logical aspects***********************************}
function LLength(const UTF8Str:TUTF8String):Cardinal;
begin
  Result := Length(UTF8Str);
end;

function LPos(const UTF8Str:TUTF8String; vp:Cardinal; pDir:TDirection):Cardinal;
var
{At beginning of the line we don't know which direction, thus the first
 character usually decides of paragrph direction}
  LeftCursorPos, RightCursorPos:Integer;
  uLen:Integer;
begin
  uLen := Length(UTF8Str);
  LeftCursorPos := 1;
  RightCursorPos := 1;
  Result := 1;
  if(uLen > 0) then
    repeat
      case UTF8Str[Result] of
        #32,'{','}','/'://Does not change direction, this is a neutral character;
          begin
            if(pDir = drLTR) then
              Inc(RightCursorPos);
          end;
        #$d8,#$d9://Arabic
          begin
            pDir := drRTL;
            Inc(Result);//Consume control character
          end;
      else //Latin
        begin
          pDir := drLTR;
          RightCursorPos := LeftCursorPos + 1;
        end;
      end;
      Inc(LeftCursorPos);
      Inc(Result);
    until(Result > uLen) or
         ((pDir = drLTR) and (LeftCursorPos > vp)) or
         ((pDir = drRTL) and (RightCursorPos > vp));
//WriteLn('uLen=',uLen,' Result=',Result,' CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  if(Result > uLen)
  then begin
    if(vp > LeftCursorPos) then begin
      Inc(Result, vp - LeftCursorPos);
      LeftCursorPos := vp;
    end;
    Inc(LeftCursorPos);
    if(vp > RightCursorPos) then
      if(pDir = drLTR) then
        RightCursorPos := vp;
  end;
//WriteLn('CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  Result := Result;
end;

function LCharOf(UTF8Str:TUTF8String; lp:Cardinal):TUTF8Char;
begin
  while(lp > 0) and (UTF8Str[lp] > #128) do
    Dec(lp);
  if lp = 0
  then
    Exit('');
  Move(Result, UTF8Str[lp], SizeOf(Result));
  SetLength(Result, ComputeCharLength(@Result[1]));
end;
{****************************Visual aspects************************************}
function VLength(const UTF8Str:TUTF8String):Cardinal;
begin
  Result := VLength(PChar(UTF8Str),LLength(UTF8Str));
end;

function VLength(p:PChar; Count:Cardinal):Cardinal;
var
  CharLen: LongInt;
begin
  VLength:=0;
  while (Count>0) do begin
    inc(Result);
    CharLen:=ComputeCharLength(p);
    inc(p,CharLen);
    dec(Count,CharLen);
  end;
end;

function VPos(const UTF8Str:TUTF8String; lp:Cardinal; pDir, cDir:TDirection):Cardinal;
var
{At beginning of the line we don't know which direction, thus the first
 character usually decides of paragrph direction}
  LeftCursorPos, RightCursorPos:Integer;
  uLen:Integer;
begin
  uLen := Length(UTF8Str);
  LeftCursorPos := 1;
  RightCursorPos := 1;
  Result := 1;
  if(uLen > 0) then
    repeat
      case UTF8Str[Result] of
        #32,'{','}','/'://Does not change direction, this is a neutral character;
          begin
            if(pDir = drLTR) then
              Inc(RightCursorPos);
          end;
        #$d8,#$d9://Arabic
          begin
            pDir := drRTL;
            Inc(Result);//Consume control character
          end;
      else //Latin
        begin
          pDir := drLTR;
          RightCursorPos := LeftCursorPos + 1;
        end;
      end;
      Inc(LeftCursorPos);
      Inc(Result);
    until(Result > uLen) or
         ((pDir = drLTR) and (LeftCursorPos > lp)) or
         ((pDir = drRTL) and (RightCursorPos > lp));
//WriteLn('uLen=',uLen,' Result=',Result,' CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  if(Result > uLen)
  then begin
    if(lp > LeftCursorPos) then begin
      Inc(Result, lp - LeftCursorPos);
      LeftCursorPos := lp;
    end;
    Inc(LeftCursorPos);
    if(lp > RightCursorPos) then
      if(pDir = drLTR) then
        RightCursorPos := lp;
  end;
//WriteLn('CursorPos=',CursorPos,' LeftCursorPos=',LeftCursorPos,' RightCursorPos=',RightCursorPos);
  Result := Result;
{  case dir of
    #32:
      CursorPos := LeftCursorPos;
    #$d8,#$d9:
      CursorPos := RightCursorPos;
    else
      CursorPos := LeftCursorPos;
  end;}
//WriteLn('Result=',Result,' New CursorPos=',CursorPos);
end;

function VPos(UTF8Char:PChar; Len:integer; BytePos:integer):Cardinal;
begin
end;


function VCharOf(UTF8Str:TUTF8String; vp:Cardinal; dir:TDirection):TUTF8Char;
var
  CharLen: LongInt;
begin
  Result:=LCharOf(UTF8Str,LPos(UTF8Str, vp, dir));
end;

end.