Browse Source

+ added tool for parsing GraphemeBreakProperty.txt and converting it to code

git-svn-id: trunk@48720 -
nickysn 4 years ago
parent
commit
9bc0f62f45
3 changed files with 439 additions and 0 deletions
  1. 2 0
      .gitattributes
  2. 58 0
      utils/unicode/gbpparser.lpi
  3. 379 0
      utils/unicode/gbpparser.lpr

+ 2 - 0
.gitattributes

@@ -19772,6 +19772,8 @@ utils/unicode/cldrtxt.pas svneol=native#text/plain
 utils/unicode/cldrxml.pas svneol=native#text/pascal
 utils/unicode/data/readme.txt svneol=native#text/plain
 utils/unicode/fpmake.pp svneol=native#text/plain
+utils/unicode/gbpparser.lpi svneol=native#text/plain
+utils/unicode/gbpparser.lpr svneol=native#text/pascal
 utils/unicode/grbtree.pas svneol=native#text/pascal
 utils/unicode/helper.pas svneol=native#text/pascal
 utils/unicode/parse-collations.bat svneol=native#text/plain

+ 58 - 0
utils/unicode/gbpparser.lpi

@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CONFIG>
+  <ProjectOptions>
+    <Version Value="11"/>
+    <General>
+      <Flags>
+        <MainUnitHasCreateFormStatements Value="False"/>
+        <MainUnitHasTitleStatement Value="False"/>
+        <MainUnitHasScaledStatement Value="False"/>
+      </Flags>
+      <SessionStorage Value="InProjectDir"/>
+      <MainUnit Value="0"/>
+      <Title Value="gbpparser"/>
+      <UseAppBundle Value="False"/>
+      <ResourceType Value="res"/>
+    </General>
+    <BuildModes Count="1">
+      <Item1 Name="Default" Default="True"/>
+    </BuildModes>
+    <PublishOptions>
+      <Version Value="2"/>
+      <UseFileFilters Value="True"/>
+    </PublishOptions>
+    <RunParams>
+      <FormatVersion Value="2"/>
+      <Modes Count="0"/>
+    </RunParams>
+    <Units Count="1">
+      <Unit0>
+        <Filename Value="gbpparser.lpr"/>
+        <IsPartOfProject Value="True"/>
+      </Unit0>
+    </Units>
+  </ProjectOptions>
+  <CompilerOptions>
+    <Version Value="11"/>
+    <Target>
+      <Filename Value="gbpparser"/>
+    </Target>
+    <SearchPaths>
+      <IncludeFiles Value="$(ProjOutDir)"/>
+      <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
+    </SearchPaths>
+  </CompilerOptions>
+  <Debugging>
+    <Exceptions Count="3">
+      <Item1>
+        <Name Value="EAbort"/>
+      </Item1>
+      <Item2>
+        <Name Value="ECodetoolError"/>
+      </Item2>
+      <Item3>
+        <Name Value="EFOpenError"/>
+      </Item3>
+    </Exceptions>
+  </Debugging>
+</CONFIG>

+ 379 - 0
utils/unicode/gbpparser.lpr

@@ -0,0 +1,379 @@
+{ Parser and code generator for the GraphemeBreakProperty.
+
+  Copyright (C) 2021 Nikolay Nikolov <[email protected]>
+
+  This source is free software; you can redistribute it and/or modify it under
+  the terms of the GNU General Public License as published by the Free
+  Software Foundation; either version 2 of the License, or (at your option)
+  any later version.
+
+  This code is distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+  details.
+
+  A copy of the GNU General Public License is available on the World Wide Web
+  at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
+  to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+  Boston, MA 02110-1335, USA.
+}
+
+
+program gbpparser;
+
+{$mode objfpc}{$H+}
+
+uses
+  SysUtils, StrUtils;
+
+type
+  TGraphemeBreakProperty = (
+    gbpOther,
+    gbpPrepend,
+    gbpCR,
+    gbpLF,
+    gbpControl,
+    gbpExtend,
+    gpbRegional_Indicator,
+    gbpSpacingMark,
+    gbpL,
+    gbpV,
+    gbpT,
+    gbpLV,
+    gbpLVT,
+    gbpE_Base,
+    gbpE_Modifier,
+    gbpZWJ,
+    gbpGlue_After_Zwj,
+    gbpE_Base_GAZ);
+
+  TRange = record
+    RangeLo, RangeHi: UCS4Char;
+  end;
+  TRanges = array of TRange;
+
+var
+  GraphemeBreakProperties: array [UCS4Char] of TGraphemeBreakProperty;
+  GBPStats: array [TGraphemeBreakProperty] of record
+    Exists: Boolean;
+    Handled: Boolean;
+    MinValue: UCS4Char;
+    MaxValue: UCS4Char;
+    Count: LongInt;
+    Ranges: TRanges;
+  end;
+
+function ParseGraphemeBreakProperty(S: string): TGraphemeBreakProperty;
+begin
+  S := Trim(S);
+  case S of
+    'Prepend':
+      Result := gbpPrepend;
+    'CR':
+      Result := gbpCR;
+    'LF':
+      Result := gbpLF;
+    'Control':
+      Result := gbpControl;
+    'Extend':
+      Result := gbpExtend;
+    'Regional_Indicator':
+      Result := gpbRegional_Indicator;
+    'SpacingMark':
+      Result := gbpSpacingMark;
+    'L':
+      Result := gbpL;
+    'V':
+      Result := gbpV;
+    'T':
+      Result := gbpT;
+    'LV':
+      Result := gbpLV;
+    'LVT':
+      Result := gbpLVT;
+    'E_Base':
+      Result := gbpE_Base;
+    'E_Modifier':
+      Result := gbpE_Modifier;
+    'ZWJ':
+      Result := gbpZWJ;
+    'Glue_After_Zwj':
+      Result := gbpGlue_After_Zwj;
+    'E_Base_GAZ':
+      Result := gbpE_Base_GAZ;
+    else
+      raise EArgumentException('Unknown grapheme break property: ''' + S + '''');
+  end;
+end;
+
+procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char);
+var
+  dp: SizeInt;
+begin
+  S := Trim(S);
+  dp := Pos('..', S);
+  if dp > 0 then
+  begin
+    RangeLo := StrToInt('$' + LeftStr(S, dp - 1));
+    RangeHi := StrToInt('$' + Copy(S, dp + 2, Length(S) - dp + 3));
+  end
+  else
+  begin
+    RangeLo := StrToInt('$' + S);
+    RangeHi := RangeLo;
+  end;
+end;
+
+procedure ParseGraphemeBreakProperties(const FileName: string);
+var
+  InF: TextFile;
+  S: string;
+  SplitS: TStringArray;
+  LineNr: Integer = 0;
+  gbp: TGraphemeBreakProperty;
+  RangeLo, RangeHi, R: UCS4Char;
+begin
+  if not FileExists(FileName) then
+  begin
+    Writeln('File doesn''t exist: ', FileName);
+    Halt(1);
+  end;
+  AssignFile(InF, FileName);
+  Reset(InF);
+  while not EoF(InF) do
+  begin
+    Inc(LineNr);
+    Readln(InF, S);
+    S := Trim(S);
+    if Pos('#', S) > 0 then
+      S := LeftStr(S, Pos('#', S) - 1);
+    if S <> '' then
+    begin
+      SplitS := S.Split([';']);
+      if Length(SplitS) <> 2 then
+        raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr));
+      ParseRange(SplitS[0], RangeLo, RangeHi);
+      gbp := ParseGraphemeBreakProperty(SplitS[1]);
+      for R := RangeLo to RangeHi do
+        GraphemeBreakProperties[R] := gbp;
+    end;
+  end;
+  CloseFile(InF);
+end;
+
+procedure CalcStatsAndRanges;
+var
+  Ch: UCS4Char;
+  gbp, prev_gbp: TGraphemeBreakProperty;
+begin
+  FillChar(GBPStats, SizeOf(GBPStats), 0);
+  gbp := Low(TGraphemeBreakProperty);
+  for Ch := Low(UCS4Char) to High(UCS4Char) do
+  begin
+    prev_gbp := gbp;
+    gbp := GraphemeBreakProperties[Ch];
+    with GBPStats[gbp] do
+    begin
+      if not Exists then
+      begin
+        Exists := True;
+        MinValue := Ch;
+        MaxValue := Ch;
+        Count := 1;
+        SetLength(Ranges, 1);
+        Ranges[0].RangeLo := Ch;
+        Ranges[0].RangeHi := Ch;
+      end
+      else
+      begin
+        MaxValue := Ch;
+        Inc(Count);
+        if prev_gbp <> gbp then
+        begin
+          SetLength(Ranges, Length(Ranges) + 1);
+          with Ranges[High(Ranges)] do
+          begin
+            RangeLo := Ch;
+            RangeHi := Ch;
+          end;
+        end
+        else
+          Ranges[High(Ranges)].RangeHi := Ch;
+      end;
+    end;
+  end;
+end;
+
+procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char);
+var
+  gbp: TGraphemeBreakProperty;
+  RI: Integer;
+begin
+  for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    if GBPStats[gbp].Exists and (not GBPStats[gbp].Handled) then
+    begin
+      for RI := 0 to High(GBPStats[gbp].Ranges) - 1 do
+        if (GBPStats[gbp].Ranges[RI].RangeHi = (RLo - 1)) and
+           (GBPStats[gbp].Ranges[RI + 1].RangeLo = (RHi + 1)) then
+        begin
+          GBPStats[gbp].Ranges[RI].RangeHi := GBPStats[gbp].Ranges[RI + 1].RangeHi;
+          Delete(GBPStats[gbp].Ranges, RI + 1, 1);
+          exit;
+        end;
+    end;
+end;
+
+function FindMinRangeCount: Integer;
+var
+  gbp: TGraphemeBreakProperty;
+begin
+  Result := High(Integer);
+  for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    if GBPStats[gbp].Exists and (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) < Result) then
+      Result := Length(GBPStats[gbp].Ranges);
+end;
+
+function ApplyLV_LVTCompression: Boolean;
+const
+  RangeLo = 44032;
+  RangeHi = 55203;
+var
+  Ch: UCS4Char;
+begin
+  Result := False;
+  if (GBPStats[gbpLV].MinValue <> RangeLo) or (GBPStats[gbpLV].MaxValue <> (RangeHi - 27)) or
+     (GBPStats[gbpLVT].MinValue <> (RangeLo + 1)) or (GBPStats[gbpLVT].MaxValue <> RangeHi) then
+    exit;
+  for Ch := RangeLo to RangeHi do
+  begin
+    if ((Ch - RangeLo) mod 28) = 0 then
+    begin
+      if GraphemeBreakProperties[Ch] <> gbpLV then
+        exit;
+    end
+    else
+    begin
+      if GraphemeBreakProperties[Ch] <> gbpLVT then
+        exit;
+    end;
+  end;
+  Result := True;
+end;
+
+procedure GenCode(const OutFileName: string);
+const
+  RangeCountThreshold = 30{400};
+var
+  gbp: TGraphemeBreakProperty;
+  RI, NextRangeCount: Integer;
+  OutFile: TextFile;
+begin
+  Writeln('Generating file: ', OutFileName);
+
+  AssignFile(OutFile, OutFileName);
+  Rewrite(OutFile);
+
+  Writeln(OutFile, '{ do not edit, this file is autogenerated by the gbpparser tool }');
+
+  { unused properties are already handled }
+  for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    if not GBPStats[gbp].Exists then
+      GBPStats[gbp].Handled := True;
+
+  { handle single codepoints first }
+  for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    if (not GBPStats[gbp].Handled) and (GBPStats[gbp].Count = 1) then
+    begin
+      if GBPStats[gbp].MinValue <> GBPStats[gbp].MaxValue then
+        raise Exception.Create('Internal error');
+      Writeln(OutFile, 'if Ch=', GBPStats[gbp].MinValue, 'then result:=',gbp,' else');
+      GBPStats[gbp].Handled := True;
+      MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
+    end;
+
+  { handle single range codepoints next }
+  while FindMinRangeCount = 1 do
+    for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+      if (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) = 1) then
+      begin
+        Writeln(OutFile, 'if(Ch>=', GBPStats[gbp].MinValue, ')and(Ch<=', GBPStats[gbp].MaxValue, ')then result:=',gbp,' else');
+        GBPStats[gbp].Handled := True;
+        MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
+      end;
+
+  if ApplyLV_LVTCompression then
+  begin
+    Writeln(OutFile, 'if(Ch>=44032)and(Ch<=55203)then begin if((Ch-44032)mod 28)=0then result:=gbpLV else result:=gbpLVT end else');
+    GBPStats[gbpLV].Handled := True;
+    GBPStats[gbpLVT].Handled := True;
+  end;
+
+  repeat
+    NextRangeCount := FindMinRangeCount;
+    if NextRangeCount <= RangeCountThreshold then
+      for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+      begin
+        if not GBPStats[gbp].Handled and (Length(GBPStats[gbp].Ranges) <= NextRangeCount) then
+        begin
+          GBPStats[gbp].Handled := True;
+          Write(OutFile, 'if');
+          for RI := 0 to High(GBPStats[gbp].Ranges) do
+          begin
+            if RI <> 0 then
+              Writeln(OutFile, 'or');
+            with GBPStats[gbp].Ranges[RI] do
+            begin
+              if RangeLo = RangeHi then
+                Write(OutFile, '(Ch=', RangeLo, ')')
+              else
+                Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))');
+              MaybeCoalesceRanges(RangeLo, RangeHi);
+            end;
+          end;
+          Writeln(OutFile, 'then result:=',gbp,' else');
+        end;
+      end;
+  until NextRangeCount > RangeCountThreshold;
+
+  if NextRangeCount <> High(Integer) then
+  begin
+    //for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    //  if not GBPStats[gbp].Handled then
+    //    Writeln(gbp, ' ', GBPStats[gbp].MinValue, '..', GBPStats[gbp].MaxValue, ' ', GBPStats[gbp].Count, ' ', Length(GBPStats[gbp].Ranges), ' ', (GBPStats[gbp].MaxValue - GBPStats[gbp].MinValue + 7) div 8);
+    Writeln(OutFile, 'case Ch of');
+    for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    begin
+      if not GBPStats[gbp].Handled then
+      begin
+        GBPStats[gbp].Handled := True;
+        for RI := 0 to High(GBPStats[gbp].Ranges) do
+        begin
+          if RI <> 0 then
+            Writeln(OutFile, ',');
+          with GBPStats[gbp].Ranges[RI] do
+          begin
+            if RangeLo = RangeHi then
+              Write(OutFile, RangeLo)
+            else
+              Write(OutFile, RangeLo, '..', RangeHi);
+          end;
+        end;
+        Writeln(OutFile, ':result:=', gbp, ';');
+      end;
+    end;
+    Writeln(OutFile, 'else result:=gbpOther end');
+  end
+  else
+    Writeln(OutFile, 'result:=gbpOther');
+
+  CloseFile(OutFile);
+end;
+
+begin
+  FillChar(GraphemeBreakProperties, SizeOf(GraphemeBreakProperties), 0);
+  ParseGraphemeBreakProperties('data/UCD/auxiliary/GraphemeBreakProperty.txt');
+  CalcStatsAndRanges;
+  GenCode('graphemebreakproperty_code.inc');
+  Writeln('Done');
+end.
+