123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379 |
- { Parser and code generator for the GraphemeBreakProperty.
- Copyright (C) 2021 Nikolay Nikolov <[email protected]>
- This source is free software; you can redistribute it and/or modify it under
- the terms of the GNU General Public License as published by the Free
- Software Foundation; either version 2 of the License, or (at your option)
- any later version.
- This code is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- details.
- A copy of the GNU General Public License is available on the World Wide Web
- at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
- to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
- Boston, MA 02110-1335, USA.
- }
- program gbpparser;
- {$mode objfpc}{$H+}
- uses
- SysUtils, StrUtils;
- type
- TGraphemeBreakProperty = (
- gbpOther,
- gbpPrepend,
- gbpCR,
- gbpLF,
- gbpControl,
- gbpExtend,
- gpbRegional_Indicator,
- gbpSpacingMark,
- gbpL,
- gbpV,
- gbpT,
- gbpLV,
- gbpLVT,
- gbpE_Base,
- gbpE_Modifier,
- gbpZWJ,
- gbpGlue_After_Zwj,
- gbpE_Base_GAZ);
- TRange = record
- RangeLo, RangeHi: UCS4Char;
- end;
- TRanges = array of TRange;
- var
- GraphemeBreakProperties: array [UCS4Char] of TGraphemeBreakProperty;
- GBPStats: array [TGraphemeBreakProperty] of record
- Exists: Boolean;
- Handled: Boolean;
- MinValue: UCS4Char;
- MaxValue: UCS4Char;
- Count: LongInt;
- Ranges: TRanges;
- end;
- function ParseGraphemeBreakProperty(S: string): TGraphemeBreakProperty;
- begin
- S := Trim(S);
- case S of
- 'Prepend':
- Result := gbpPrepend;
- 'CR':
- Result := gbpCR;
- 'LF':
- Result := gbpLF;
- 'Control':
- Result := gbpControl;
- 'Extend':
- Result := gbpExtend;
- 'Regional_Indicator':
- Result := gpbRegional_Indicator;
- 'SpacingMark':
- Result := gbpSpacingMark;
- 'L':
- Result := gbpL;
- 'V':
- Result := gbpV;
- 'T':
- Result := gbpT;
- 'LV':
- Result := gbpLV;
- 'LVT':
- Result := gbpLVT;
- 'E_Base':
- Result := gbpE_Base;
- 'E_Modifier':
- Result := gbpE_Modifier;
- 'ZWJ':
- Result := gbpZWJ;
- 'Glue_After_Zwj':
- Result := gbpGlue_After_Zwj;
- 'E_Base_GAZ':
- Result := gbpE_Base_GAZ;
- else
- raise EArgumentException('Unknown grapheme break property: ''' + S + '''');
- end;
- end;
- procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char);
- var
- dp: SizeInt;
- begin
- S := Trim(S);
- dp := Pos('..', S);
- if dp > 0 then
- begin
- RangeLo := StrToInt('$' + LeftStr(S, dp - 1));
- RangeHi := StrToInt('$' + Copy(S, dp + 2, Length(S) - dp + 3));
- end
- else
- begin
- RangeLo := StrToInt('$' + S);
- RangeHi := RangeLo;
- end;
- end;
- procedure ParseGraphemeBreakProperties(const FileName: string);
- var
- InF: TextFile;
- S: string;
- SplitS: TStringArray;
- LineNr: Integer = 0;
- gbp: TGraphemeBreakProperty;
- RangeLo, RangeHi, R: UCS4Char;
- begin
- if not FileExists(FileName) then
- begin
- Writeln('File doesn''t exist: ', FileName);
- Halt(1);
- end;
- AssignFile(InF, FileName);
- Reset(InF);
- while not EoF(InF) do
- begin
- Inc(LineNr);
- Readln(InF, S);
- S := Trim(S);
- if Pos('#', S) > 0 then
- S := LeftStr(S, Pos('#', S) - 1);
- if S <> '' then
- begin
- SplitS := S.Split([';']);
- if Length(SplitS) <> 2 then
- raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr));
- ParseRange(SplitS[0], RangeLo, RangeHi);
- gbp := ParseGraphemeBreakProperty(SplitS[1]);
- for R := RangeLo to RangeHi do
- GraphemeBreakProperties[R] := gbp;
- end;
- end;
- CloseFile(InF);
- end;
- procedure CalcStatsAndRanges;
- var
- Ch: UCS4Char;
- gbp, prev_gbp: TGraphemeBreakProperty;
- begin
- FillChar(GBPStats, SizeOf(GBPStats), 0);
- gbp := Low(TGraphemeBreakProperty);
- for Ch := Low(UCS4Char) to High(UCS4Char) do
- begin
- prev_gbp := gbp;
- gbp := GraphemeBreakProperties[Ch];
- with GBPStats[gbp] do
- begin
- if not Exists then
- begin
- Exists := True;
- MinValue := Ch;
- MaxValue := Ch;
- Count := 1;
- SetLength(Ranges, 1);
- Ranges[0].RangeLo := Ch;
- Ranges[0].RangeHi := Ch;
- end
- else
- begin
- MaxValue := Ch;
- Inc(Count);
- if prev_gbp <> gbp then
- begin
- SetLength(Ranges, Length(Ranges) + 1);
- with Ranges[High(Ranges)] do
- begin
- RangeLo := Ch;
- RangeHi := Ch;
- end;
- end
- else
- Ranges[High(Ranges)].RangeHi := Ch;
- end;
- end;
- end;
- end;
- procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char);
- var
- gbp: TGraphemeBreakProperty;
- RI: Integer;
- begin
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- if GBPStats[gbp].Exists and (not GBPStats[gbp].Handled) then
- begin
- for RI := 0 to High(GBPStats[gbp].Ranges) - 1 do
- if (GBPStats[gbp].Ranges[RI].RangeHi = (RLo - 1)) and
- (GBPStats[gbp].Ranges[RI + 1].RangeLo = (RHi + 1)) then
- begin
- GBPStats[gbp].Ranges[RI].RangeHi := GBPStats[gbp].Ranges[RI + 1].RangeHi;
- Delete(GBPStats[gbp].Ranges, RI + 1, 1);
- exit;
- end;
- end;
- end;
- function FindMinRangeCount: Integer;
- var
- gbp: TGraphemeBreakProperty;
- begin
- Result := High(Integer);
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- if GBPStats[gbp].Exists and (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) < Result) then
- Result := Length(GBPStats[gbp].Ranges);
- end;
- function ApplyLV_LVTCompression: Boolean;
- const
- RangeLo = 44032;
- RangeHi = 55203;
- var
- Ch: UCS4Char;
- begin
- Result := False;
- if (GBPStats[gbpLV].MinValue <> RangeLo) or (GBPStats[gbpLV].MaxValue <> (RangeHi - 27)) or
- (GBPStats[gbpLVT].MinValue <> (RangeLo + 1)) or (GBPStats[gbpLVT].MaxValue <> RangeHi) then
- exit;
- for Ch := RangeLo to RangeHi do
- begin
- if ((Ch - RangeLo) mod 28) = 0 then
- begin
- if GraphemeBreakProperties[Ch] <> gbpLV then
- exit;
- end
- else
- begin
- if GraphemeBreakProperties[Ch] <> gbpLVT then
- exit;
- end;
- end;
- Result := True;
- end;
- procedure GenCode(const OutFileName: string);
- const
- RangeCountThreshold = 30{400};
- var
- gbp: TGraphemeBreakProperty;
- RI, NextRangeCount: Integer;
- OutFile: TextFile;
- begin
- Writeln('Generating file: ', OutFileName);
- AssignFile(OutFile, OutFileName);
- Rewrite(OutFile);
- Writeln(OutFile, '{ do not edit, this file is autogenerated by the gbpparser tool }');
- { unused properties are already handled }
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- if not GBPStats[gbp].Exists then
- GBPStats[gbp].Handled := True;
- { handle single codepoints first }
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- if (not GBPStats[gbp].Handled) and (GBPStats[gbp].Count = 1) then
- begin
- if GBPStats[gbp].MinValue <> GBPStats[gbp].MaxValue then
- raise Exception.Create('Internal error');
- Writeln(OutFile, 'if Ch=', GBPStats[gbp].MinValue, 'then result:=',gbp,' else');
- GBPStats[gbp].Handled := True;
- MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
- end;
- { handle single range codepoints next }
- while FindMinRangeCount = 1 do
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- if (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) = 1) then
- begin
- Writeln(OutFile, 'if(Ch>=', GBPStats[gbp].MinValue, ')and(Ch<=', GBPStats[gbp].MaxValue, ')then result:=',gbp,' else');
- GBPStats[gbp].Handled := True;
- MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
- end;
- if ApplyLV_LVTCompression then
- begin
- Writeln(OutFile, 'if(Ch>=44032)and(Ch<=55203)then begin if((Ch-44032)mod 28)=0then result:=gbpLV else result:=gbpLVT end else');
- GBPStats[gbpLV].Handled := True;
- GBPStats[gbpLVT].Handled := True;
- end;
- repeat
- NextRangeCount := FindMinRangeCount;
- if NextRangeCount <= RangeCountThreshold then
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- begin
- if not GBPStats[gbp].Handled and (Length(GBPStats[gbp].Ranges) <= NextRangeCount) then
- begin
- GBPStats[gbp].Handled := True;
- Write(OutFile, 'if');
- for RI := 0 to High(GBPStats[gbp].Ranges) do
- begin
- if RI <> 0 then
- Writeln(OutFile, 'or');
- with GBPStats[gbp].Ranges[RI] do
- begin
- if RangeLo = RangeHi then
- Write(OutFile, '(Ch=', RangeLo, ')')
- else
- Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))');
- MaybeCoalesceRanges(RangeLo, RangeHi);
- end;
- end;
- Writeln(OutFile, 'then result:=',gbp,' else');
- end;
- end;
- until NextRangeCount > RangeCountThreshold;
- if NextRangeCount <> High(Integer) then
- begin
- //for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- // if not GBPStats[gbp].Handled then
- // Writeln(gbp, ' ', GBPStats[gbp].MinValue, '..', GBPStats[gbp].MaxValue, ' ', GBPStats[gbp].Count, ' ', Length(GBPStats[gbp].Ranges), ' ', (GBPStats[gbp].MaxValue - GBPStats[gbp].MinValue + 7) div 8);
- Writeln(OutFile, 'case Ch of');
- for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
- begin
- if not GBPStats[gbp].Handled then
- begin
- GBPStats[gbp].Handled := True;
- for RI := 0 to High(GBPStats[gbp].Ranges) do
- begin
- if RI <> 0 then
- Writeln(OutFile, ',');
- with GBPStats[gbp].Ranges[RI] do
- begin
- if RangeLo = RangeHi then
- Write(OutFile, RangeLo)
- else
- Write(OutFile, RangeLo, '..', RangeHi);
- end;
- end;
- Writeln(OutFile, ':result:=', gbp, ';');
- end;
- end;
- Writeln(OutFile, 'else result:=gbpOther end');
- end
- else
- Writeln(OutFile, 'result:=gbpOther');
- CloseFile(OutFile);
- end;
- begin
- FillChar(GraphemeBreakProperties, SizeOf(GraphemeBreakProperties), 0);
- ParseGraphemeBreakProperties('data/UCD/auxiliary/GraphemeBreakProperty.txt');
- CalcStatsAndRanges;
- GenCode('graphemebreakproperty_code.inc');
- Writeln('Done');
- end.
|