Quellcode durchsuchen

* synchronized with trunk

git-svn-id: branches/unicodekvm@48756 -
nickysn vor 4 Jahren
Ursprung
Commit
cd8b0036d4

+ 5 - 0
.gitattributes

@@ -9097,6 +9097,8 @@ packages/rtl-unicode/src/inc/cp936.pas svneol=native#text/pascal
 packages/rtl-unicode/src/inc/cp949.pas svneol=native#text/pascal
 packages/rtl-unicode/src/inc/cp950.pas svneol=native#text/pascal
 packages/rtl-unicode/src/inc/cpbuildu.pp svneol=native#text/plain
+packages/rtl-unicode/src/inc/eastasianwidth.pp svneol=native#text/plain
+packages/rtl-unicode/src/inc/eastasianwidth_code.inc svneol=native#text/plain
 packages/rtl-unicode/src/inc/freebidi.pp svneol=native#text/plain
 packages/rtl-unicode/src/inc/graphemebreakproperty.pp svneol=native#text/plain
 packages/rtl-unicode/src/inc/graphemebreakproperty_code.inc svneol=native#text/plain
@@ -16212,6 +16214,7 @@ tests/test/units/strings/tstrings1.pp svneol=native#text/plain
 tests/test/units/strutils/taddchar.pp svneol=native#text/plain
 tests/test/units/strutils/taddcharr.pp svneol=native#text/plain
 tests/test/units/strutils/tbintohex.pp svneol=native#text/plain
+tests/test/units/strutils/tboyer.pp svneol=native#text/pascal
 tests/test/units/strutils/tdec2numb.pp svneol=native#text/plain
 tests/test/units/strutils/thex2dec.pp svneol=native#text/plain
 tests/test/units/strutils/thextobin.pp svneol=native#text/plain
@@ -19836,6 +19839,8 @@ utils/unicode/cldrtest.pas svneol=native#text/pascal
 utils/unicode/cldrtxt.pas svneol=native#text/plain
 utils/unicode/cldrxml.pas svneol=native#text/pascal
 utils/unicode/data/readme.txt svneol=native#text/plain
+utils/unicode/eawparser.lpi svneol=native#text/plain
+utils/unicode/eawparser.lpr svneol=native#text/pascal
 utils/unicode/fpmake.pp svneol=native#text/plain
 utils/unicode/gbpparser.lpi svneol=native#text/plain
 utils/unicode/gbpparser.lpr svneol=native#text/pascal

+ 2 - 4
packages/rtl-objpas/src/inc/strutils.pp

@@ -429,8 +429,7 @@ begin
       AddMatch(i+1);
       //Only first match ?
       if not aMatchAll then break;
-      inc(i,OldPatternSize);
-      inc(i,OldPatternSize);
+      inc(i,DeltaJumpTable2[0]);
     end else begin
       i:=i + Max(DeltaJumpTable1[ord(s[i])],DeltaJumpTable2[j]);
     end;
@@ -582,8 +581,7 @@ begin
       AddMatch(i+1);
       //Only first match ?
       if not aMatchAll then break;
-      inc(i,OldPatternSize);
-      inc(i,OldPatternSize);
+      inc(i,DeltaJumpTable2[0]);
     end else begin
       i:=i + Max(DeltaJumpTable1[Ord(lCaseArray[Ord(s[i])])],DeltaJumpTable2[j]);
     end;

+ 9 - 2
packages/rtl-unicode/fpmake.pp

@@ -16,12 +16,13 @@ Const
   CPUnits       = [aix,amiga,aros,android,beos,darwin,iphonesim,ios,emx,gba,nds,freebsd,go32v2,haiku,linux,morphos,netbsd,netware,netwlibc,openbsd,os2,solaris,watcom,wii,win32,win64,wince,dragonfly,freertos];
   utf8bidiOSes  = [netware,netwlibc];
   freebidiOSes  = [netware,netwlibc];
-  GraphemeBreakPropertyOSes = AllOSes;
+  GraphemeBreakPropertyOSes = AllOSes-[embedded,zxspectrum,msxdos,amstradcpc];
+  EastAsianWidthOSes        = AllOSes-[embedded,zxspectrum,msxdos,amstradcpc];
 
 // Character not movable because fpwidestring depends on it.
 //  CharacterOSes = [android,darwin,freebsd,linux,netbsd,openbsd,solaris,win32,win64,dragonfly];
 
-  UnicodeAllOSes =   CollationOSes + utf8bidiOSes + freebidiOSes + CPUnits + GraphemeBreakPropertyOSes;
+  UnicodeAllOSes =   CollationOSes + utf8bidiOSes + freebidiOSes + CPUnits + GraphemeBreakPropertyOSes + EastAsianWidthOSes;
 
 // Amiga has a crt in its RTL dir, but it is commented in the makefile
 
@@ -145,6 +146,12 @@ begin
       begin
         AddInclude('graphemebreakproperty_code.inc');
       end;
+
+    T:=P.Targets.AddUnit('eastasianwidth.pp',EastAsianWidthOSes);
+    with T.Dependencies do
+      begin
+        AddInclude('eastasianwidth_code.inc');
+      end;
   end
 end;
 

+ 55 - 0
packages/rtl-unicode/src/inc/eastasianwidth.pp

@@ -0,0 +1,55 @@
+{ EastAsianWidth Unicode data unit.
+
+  Copyright (C) 2021 Nikolay Nikolov <[email protected]>
+
+  This library is free software; you can redistribute it and/or modify it
+  under the terms of the GNU Library General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or (at your
+  option) any later version with the following modification:
+
+  As a special exception, the copyright holders of this library give you
+  permission to link this library with independent modules to produce an
+  executable, regardless of the license terms of these independent modules,and
+  to copy and distribute the resulting executable under terms of your choice,
+  provided that you also meet, for each linked independent module, the terms
+  and conditions of the license of that module. An independent module is a
+  module which is not derived from or based on this library. If you modify
+  this library, you may extend this exception to your version of the library,
+  but you are not obligated to do so. If you do not wish to do so, delete this
+  exception statement from your version.
+
+  This program is distributed in the hope that it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
+  for more details.
+
+  You should have received a copy of the GNU Library General Public License
+  along with this library; if not, write to the Free Software Foundation,
+  Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1335, USA.
+}
+
+unit eastasianwidth;
+
+{$MODE objfpc}
+
+interface
+
+type
+  TEastAsianWidth = (
+    eawN,
+    eawA,
+    eawF,
+    eawH,
+    eawNa,
+    eawW);
+
+function GetEastAsianWidth(Ch: UCS4Char): TEastAsianWidth;
+
+implementation
+
+function GetEastAsianWidth(Ch: UCS4Char): TEastAsianWidth;
+begin
+  {$I eastasianwidth_code.inc}
+end;
+
+end.

+ 300 - 0
packages/rtl-unicode/src/inc/eastasianwidth_code.inc

@@ -0,0 +1,300 @@
+{ do not edit, this file is autogenerated by the eawparser tool }
+if(Ch=12288)or
+((Ch>=65281)and(Ch<=65376))or
+((Ch>=65504)and(Ch<=65510))then result:=eawF else
+if(Ch=8361)or
+((Ch>=65377)and(Ch<=65470))or
+((Ch>=65474)and(Ch<=65479))or
+((Ch>=65482)and(Ch<=65487))or
+((Ch>=65490)and(Ch<=65495))or
+((Ch>=65498)and(Ch<=65500))or
+((Ch>=65512)and(Ch<=65518))then result:=eawH else
+if((Ch>=32)and(Ch<=126))or
+((Ch>=162)and(Ch<=163))or
+((Ch>=165)and(Ch<=166))or
+(Ch=172)or
+(Ch=175)or
+((Ch>=10214)and(Ch<=10221))or
+((Ch>=10629)and(Ch<=10630))then result:=eawNa else
+case Ch of
+161..168,
+170,
+173..180,
+182..186,
+188..191,
+198,
+208,
+215..216,
+222..225,
+230,
+232..234,
+236..237,
+240,
+242..243,
+247..250,
+252,
+254,
+257,
+273,
+275,
+283,
+294..295,
+299,
+305..307,
+312,
+319..322,
+324,
+328..331,
+333,
+338..339,
+358..359,
+363,
+462,
+464,
+466,
+468,
+470,
+472,
+474,
+476,
+593,
+609,
+708,
+711,
+713..715,
+717,
+720,
+728..731,
+733,
+735,
+768..879,
+913..929,
+931..937,
+945..961,
+963..969,
+1025,
+1040..1103,
+1105,
+8208,
+8211..8214,
+8216..8217,
+8220..8221,
+8224..8226,
+8228..8231,
+8240,
+8242..8243,
+8245,
+8251,
+8254,
+8308,
+8319,
+8321..8324,
+8364,
+8451,
+8453,
+8457,
+8467,
+8470,
+8481..8482,
+8486,
+8491,
+8531..8532,
+8539..8542,
+8544..8555,
+8560..8569,
+8585,
+8592..8601,
+8632..8633,
+8658,
+8660,
+8679,
+8704,
+8706..8707,
+8711..8712,
+8715,
+8719,
+8721,
+8725,
+8730,
+8733..8736,
+8739,
+8741,
+8743..8748,
+8750,
+8756..8759,
+8764..8765,
+8776,
+8780,
+8786,
+8800..8801,
+8804..8807,
+8810..8811,
+8814..8815,
+8834..8835,
+8838..8839,
+8853,
+8857,
+8869,
+8895,
+8978,
+9312..9449,
+9451..9547,
+9552..9587,
+9600..9615,
+9618..9621,
+9632..9633,
+9635..9641,
+9650..9651,
+9654..9655,
+9660..9661,
+9664..9665,
+9670..9672,
+9675,
+9678..9681,
+9698..9701,
+9711,
+9733..9734,
+9737,
+9742..9743,
+9756,
+9758,
+9792,
+9794,
+9824..9825,
+9827..9829,
+9831..9834,
+9836..9837,
+9839,
+9886..9887,
+9919,
+9926..9933,
+9935..9939,
+9941..9953,
+9955,
+9960..9961,
+9963..9969,
+9972,
+9974..9977,
+9979..9980,
+9982..9983,
+10045,
+10102..10111,
+11094..11097,
+12872..12879,
+57344..63743,
+65024..65039,
+65533,
+127232..127242,
+127248..127277,
+127280..127337,
+127344..127373,
+127375..127376,
+127387..127404,
+917760..917999,
+983040..1048573,
+1048576..1114109:result:=eawA;
+4352..4447,
+8986..8987,
+9001..9002,
+9193..9196,
+9200,
+9203,
+9725..9726,
+9748..9749,
+9800..9811,
+9855,
+9875,
+9889,
+9898..9899,
+9917..9918,
+9924..9925,
+9934,
+9940,
+9962,
+9970..9971,
+9973,
+9978,
+9981,
+9989,
+9994..9995,
+10024,
+10060,
+10062,
+10067..10069,
+10071,
+10133..10135,
+10160,
+10175,
+11035..11036,
+11088,
+11093,
+11904..11929,
+11931..12019,
+12032..12245,
+12272..12283,
+12289..12350,
+12353..12438,
+12441..12543,
+12549..12589,
+12593..12686,
+12688..12730,
+12736..12771,
+12784..12830,
+12832..12871,
+12880..13054,
+13056..19903,
+19968..42124,
+42128..42182,
+43360..43388,
+44032..55203,
+63744..64255,
+65040..65049,
+65072..65106,
+65108..65126,
+65128..65131,
+94176,
+94208..100332,
+100352..101106,
+110592..110593,
+126980,
+127183,
+127374,
+127377..127386,
+127488..127490,
+127504..127547,
+127552..127560,
+127568..127569,
+127744..127776,
+127789..127797,
+127799..127868,
+127870..127891,
+127904..127946,
+127951..127955,
+127968..127984,
+127988,
+127992..128062,
+128064,
+128066..128252,
+128255..128317,
+128331..128334,
+128336..128359,
+128378,
+128405..128406,
+128420,
+128507..128591,
+128640..128709,
+128716,
+128720..128722,
+128747..128748,
+128756..128758,
+129296..129310,
+129312..129319,
+129328,
+129331..129342,
+129344..129355,
+129360..129374,
+129408..129425,
+129472,
+131072..196605,
+196608..262141:result:=eawW;
+else result:=eawN end

+ 1 - 1
rtl/embedded/Makefile

@@ -374,7 +374,7 @@ CPU_SPECIFIC_COMMON_UNITS=
 ifeq ($(ARCH),arm)
 CPU_SPECIFIC_COMMON_UNITS=sysutils math classes fgl macpas typinfo types rtlconsts getopts lineinfo
 ifeq ($(SUBARCH),armv7m)
-CPU_UNITS=lm3fury lm3tempest stm32f10x_ld stm32f10x_md stm32f10x_hd stm32f10x_xl stm32f10x_conn stm32f10x_cl lpc13xx lpc1768 lm4f120 sam3x8e xmc4500 cortexm3 cortexm4 # thumb2_bare
+CPU_UNITS=lm3fury lm3tempest stm32f10x_ld stm32f10x_md stm32f10x_hd stm32f10x_xl stm32f10x_conn stm32f10x_cl lpc13xx lpc1768 sam3x8e xmc4500 cortexm3 cortexm4 # thumb2_bare
 CPU_UNITS_DEFINED=1
 endif
 ifeq ($(SUBARCH),armv7em)

+ 1 - 1
rtl/embedded/Makefile.fpc

@@ -71,7 +71,7 @@ CPU_SPECIFIC_COMMON_UNITS=
 ifeq ($(ARCH),arm)
 CPU_SPECIFIC_COMMON_UNITS=sysutils math classes fgl macpas typinfo types rtlconsts getopts lineinfo
 ifeq ($(SUBARCH),armv7m)
-CPU_UNITS=lm3fury lm3tempest stm32f10x_ld stm32f10x_md stm32f10x_hd stm32f10x_xl stm32f10x_conn stm32f10x_cl lpc13xx lpc1768 lm4f120 sam3x8e xmc4500 cortexm3 cortexm4 # thumb2_bare
+CPU_UNITS=lm3fury lm3tempest stm32f10x_ld stm32f10x_md stm32f10x_hd stm32f10x_xl stm32f10x_conn stm32f10x_cl lpc13xx lpc1768 sam3x8e xmc4500 cortexm3 cortexm4 # thumb2_bare
 CPU_UNITS_DEFINED=1
 endif
 ifeq ($(SUBARCH),armv7em)

+ 79 - 0
tests/test/units/strutils/tboyer.pp

@@ -0,0 +1,79 @@
+{$mode objfpc}
+
+uses
+  StrUtils;
+const
+  result1 : array of SizeInt = (1, 4, 7, 10, 13, 16);
+var 
+  a : array of SizeInt;
+  i : LongInt;
+begin
+  if FindMatchesBoyerMooreCaseSensitive('abcabcabcabcabcabcab','abcab',a,false) then
+    begin
+      if Length(a)<>1 then
+        halt(2);
+      if a[0]<>result1[0] then
+        halt(3);
+    end
+  else
+    halt(1);
+
+  if FindMatchesBoyerMooreCaseSensitive('abcabcabcabcabcabcab','abcab',a,true) then
+    begin
+      if Length(a)<>Length(result1) then
+        halt(12);
+      for i:=Low(a) to High(a) do
+        if a[i]<>result1[i] then
+          halt(13);
+    end
+  else
+    halt(11);
+
+  if FindMatchesBoyerMooreCaseInSensitive('abcabcabcabcabcabcab','abcab',a,false) then
+    begin
+      if Length(a)<>1 then
+        halt(22);
+      if a[0]<>result1[0] then
+        halt(23);
+    end
+  else
+    halt(21);
+
+{
+  apparently not working yet:
+  
+  if FindMatchesBoyerMooreCaseInSensitive('abcabcabcabcabcabcab','abcab',a,true) then
+    begin
+      if Length(a)<>Length(result1) then
+        halt(32);
+      for i:=Low(a) to High(a) do
+        if a[i]<>result1[i] then
+          halt(33);
+    end
+  else
+    halt(31);
+
+  if FindMatchesBoyerMooreCaseInSensitive('abcabcabcAbcabcAbcab','abcaB',a,false) then
+    begin
+      if Length(a)<>1 then
+        halt(42);
+      if a[0]<>result1[0] then
+        halt(43);
+    end
+  else
+    halt(41);
+
+  if FindMatchesBoyerMooreCaseInSensitive('abcabCabcAbcabcABcab','abcaB',a,true) then
+    begin
+      if Length(a)<>Length(result1) then
+        halt(52);
+      for i:=Low(a) to High(a) do
+        if a[i]<>result1[i] then
+          halt(53);
+    end
+  else
+    halt(51);
+}
+
+  writeln('ok');
+end.

+ 58 - 0
utils/unicode/eawparser.lpi

@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<CONFIG>
+  <ProjectOptions>
+    <Version Value="11"/>
+    <General>
+      <Flags>
+        <MainUnitHasCreateFormStatements Value="False"/>
+        <MainUnitHasTitleStatement Value="False"/>
+        <MainUnitHasScaledStatement Value="False"/>
+      </Flags>
+      <SessionStorage Value="InProjectDir"/>
+      <MainUnit Value="0"/>
+      <Title Value="eawparser"/>
+      <UseAppBundle Value="False"/>
+      <ResourceType Value="res"/>
+    </General>
+    <BuildModes Count="1">
+      <Item1 Name="Default" Default="True"/>
+    </BuildModes>
+    <PublishOptions>
+      <Version Value="2"/>
+      <UseFileFilters Value="True"/>
+    </PublishOptions>
+    <RunParams>
+      <FormatVersion Value="2"/>
+      <Modes Count="0"/>
+    </RunParams>
+    <Units Count="1">
+      <Unit0>
+        <Filename Value="eawparser.lpr"/>
+        <IsPartOfProject Value="True"/>
+      </Unit0>
+    </Units>
+  </ProjectOptions>
+  <CompilerOptions>
+    <Version Value="11"/>
+    <Target>
+      <Filename Value="eawparser"/>
+    </Target>
+    <SearchPaths>
+      <IncludeFiles Value="$(ProjOutDir)"/>
+      <UnitOutputDirectory Value="lib/$(TargetCPU)-$(TargetOS)"/>
+    </SearchPaths>
+  </CompilerOptions>
+  <Debugging>
+    <Exceptions Count="3">
+      <Item1>
+        <Name Value="EAbort"/>
+      </Item1>
+      <Item2>
+        <Name Value="ECodetoolError"/>
+      </Item2>
+      <Item3>
+        <Name Value="EFOpenError"/>
+      </Item3>
+    </Exceptions>
+  </Debugging>
+</CONFIG>

+ 332 - 0
utils/unicode/eawparser.lpr

@@ -0,0 +1,332 @@
+{ Parser and code generator for the EastAsianWidth.
+
+  Copyright (C) 2021 Nikolay Nikolov <[email protected]>
+
+  This source is free software; you can redistribute it and/or modify it under
+  the terms of the GNU General Public License as published by the Free
+  Software Foundation; either version 2 of the License, or (at your option)
+  any later version.
+
+  This code is distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+  details.
+
+  A copy of the GNU General Public License is available on the World Wide Web
+  at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
+  to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+  Boston, MA 02110-1335, USA.
+}
+
+program eawparser;
+
+{$mode objfpc}{$H+}
+
+uses
+  SysUtils, StrUtils;
+
+type
+  TEastAsianWidth = (
+    eawN,
+    eawA,
+    eawF,
+    eawH,
+    eawNa,
+    eawW);
+
+  TRange = record
+    RangeLo, RangeHi: UCS4Char;
+  end;
+  TRanges = array of TRange;
+
+var
+  EastAsianWidths: array [UCS4Char] of TEastAsianWidth;
+  EAWStats: array [TEastAsianWidth] of record
+    Exists: Boolean;
+    Handled: Boolean;
+    MinValue: UCS4Char;
+    MaxValue: UCS4Char;
+    Count: LongInt;
+    Ranges: TRanges;
+  end;
+
+function ParseEastAsianWidth(S: string): TEastAsianWidth;
+begin
+  S := Trim(S);
+  case S of
+    'N':
+      Result := eawN;
+    'A':
+      Result := eawA;
+    'F':
+      Result := eawF;
+    'H':
+      Result := eawH;
+    'Na':
+      Result := eawNa;
+    'W':
+      Result := eawW;
+    else
+      raise EArgumentException('Unknown east asian width: ''' + S + '''');
+  end;
+end;
+
+procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char);
+var
+  dp: SizeInt;
+begin
+  S := Trim(S);
+  dp := Pos('..', S);
+  if dp > 0 then
+  begin
+    RangeLo := StrToInt('$' + LeftStr(S, dp - 1));
+    RangeHi := StrToInt('$' + Copy(S, dp + 2, Length(S) - dp + 3));
+  end
+  else
+  begin
+    RangeLo := StrToInt('$' + S);
+    RangeHi := RangeLo;
+  end;
+end;
+
+procedure ParseEastAsianWidths(const FileName: string);
+var
+  InF: TextFile;
+  S: string;
+  SplitS: TStringArray;
+  LineNr: Integer = 0;
+  eaw: TEastAsianWidth;
+  RangeLo, RangeHi, R: UCS4Char;
+begin
+  { - All code points, assigned or unassigned, that are not listed
+      explicitly are given the value "N". }
+  for R in UCS4Char do
+    EastAsianWidths[R] := eawN;
+  { - The unassigned code points in the following blocks default to "W":
+           CJK Unified Ideographs Extension A: U+3400..U+4DBF
+           CJK Unified Ideographs:             U+4E00..U+9FFF
+           CJK Compatibility Ideographs:       U+F900..U+FAFF }
+  for R := $3400 to $4DBF do
+    EastAsianWidths[R] := eawW;
+  for R := $4E00 to $9FFF do
+    EastAsianWidths[R] := eawW;
+  for R := $F900 to $FAFF do
+    EastAsianWidths[R] := eawW;
+  { - All undesignated code points in Planes 2 and 3, whether inside or
+        outside of allocated blocks, default to "W":
+           Plane 2:                            U+20000..U+2FFFD
+           Plane 3:                            U+30000..U+3FFFD }
+  for R := $20000 to $2FFFD do
+    EastAsianWidths[R] := eawW;
+  for R := $30000 to $3FFFD do
+    EastAsianWidths[R] := eawW;
+
+  if not FileExists(FileName) then
+  begin
+    Writeln('File doesn''t exist: ', FileName);
+    Halt(1);
+  end;
+  AssignFile(InF, FileName);
+  Reset(InF);
+  while not EoF(InF) do
+  begin
+    Inc(LineNr);
+    Readln(InF, S);
+    S := Trim(S);
+    if Pos('#', S) > 0 then
+      S := LeftStr(S, Pos('#', S) - 1);
+    if S <> '' then
+    begin
+      SplitS := S.Split([';']);
+      if Length(SplitS) <> 2 then
+        raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr));
+      ParseRange(SplitS[0], RangeLo, RangeHi);
+      eaw := ParseEastAsianWidth(SplitS[1]);
+      for R := RangeLo to RangeHi do
+        EastAsianWidths[R] := eaw;
+    end;
+  end;
+  CloseFile(InF);
+end;
+
+procedure CalcStatsAndRanges;
+var
+  Ch: UCS4Char;
+  eaw, prev_eaw: TEastAsianWidth;
+begin
+  FillChar(EAWStats, SizeOf(EAWStats), 0);
+  eaw := Low(TEastAsianWidth);
+  for Ch := Low(UCS4Char) to High(UCS4Char) do
+  begin
+    prev_eaw := eaw;
+    eaw := EastAsianWidths[Ch];
+    with EAWStats[eaw] do
+    begin
+      if not Exists then
+      begin
+        Exists := True;
+        MinValue := Ch;
+        MaxValue := Ch;
+        Count := 1;
+        SetLength(Ranges, 1);
+        Ranges[0].RangeLo := Ch;
+        Ranges[0].RangeHi := Ch;
+      end
+      else
+      begin
+        MaxValue := Ch;
+        Inc(Count);
+        if prev_eaw <> eaw then
+        begin
+          SetLength(Ranges, Length(Ranges) + 1);
+          with Ranges[High(Ranges)] do
+          begin
+            RangeLo := Ch;
+            RangeHi := Ch;
+          end;
+        end
+        else
+          Ranges[High(Ranges)].RangeHi := Ch;
+      end;
+    end;
+  end;
+end;
+
+procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char);
+var
+  eaw: TEastAsianWidth;
+  RI: Integer;
+begin
+  for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+    if EAWStats[eaw].Exists and (not EAWStats[eaw].Handled) then
+    begin
+      for RI := 0 to High(EAWStats[eaw].Ranges) - 1 do
+        if (EAWStats[eaw].Ranges[RI].RangeHi = (RLo - 1)) and
+           (EAWStats[eaw].Ranges[RI + 1].RangeLo = (RHi + 1)) then
+        begin
+          EAWStats[eaw].Ranges[RI].RangeHi := EAWStats[eaw].Ranges[RI + 1].RangeHi;
+          Delete(EAWStats[eaw].Ranges, RI + 1, 1);
+          exit;
+        end;
+    end;
+end;
+
+function FindMinRangeCount: Integer;
+var
+  eaw: TEastAsianWidth;
+begin
+  Result := High(Integer);
+  for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+    if EAWStats[eaw].Exists and (not EAWStats[eaw].Handled) and (Length(EAWStats[eaw].Ranges) < Result) then
+      Result := Length(EAWStats[eaw].Ranges);
+end;
+
+procedure GenCode(const OutFileName: string);
+const
+  RangeCountThreshold = 30{400};
+var
+  eaw: TEastAsianWidth;
+  RI, NextRangeCount: Integer;
+  OutFile: TextFile;
+begin
+  Writeln('Generating file: ', OutFileName);
+
+  AssignFile(OutFile, OutFileName);
+  Rewrite(OutFile);
+
+  Writeln(OutFile, '{ do not edit, this file is autogenerated by the eawparser tool }');
+
+  { unused properties are already handled }
+  for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+    if not EAWStats[eaw].Exists then
+      EAWStats[eaw].Handled := True;
+
+  { handle single codepoints first }
+  for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+    if (not EAWStats[eaw].Handled) and (EAWStats[eaw].Count = 1) then
+    begin
+      if EAWStats[eaw].MinValue <> EAWStats[eaw].MaxValue then
+        raise Exception.Create('Internal error');
+      Writeln(OutFile, 'if Ch=', EAWStats[eaw].MinValue, 'then result:=',eaw,' else');
+      EAWStats[eaw].Handled := True;
+      MaybeCoalesceRanges(EAWStats[eaw].MinValue, EAWStats[eaw].MaxValue);
+    end;
+
+  { handle single range codepoints next }
+  while FindMinRangeCount = 1 do
+    for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+      if (not EAWStats[eaw].Handled) and (Length(EAWStats[eaw].Ranges) = 1) then
+      begin
+        Writeln(OutFile, 'if(Ch>=', EAWStats[eaw].MinValue, ')and(Ch<=', EAWStats[eaw].MaxValue, ')then result:=',eaw,' else');
+        EAWStats[eaw].Handled := True;
+        MaybeCoalesceRanges(EAWStats[eaw].MinValue, EAWStats[eaw].MaxValue);
+      end;
+
+  repeat
+    NextRangeCount := FindMinRangeCount;
+    if NextRangeCount <= RangeCountThreshold then
+      for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+      begin
+        if not EAWStats[eaw].Handled and (Length(EAWStats[eaw].Ranges) <= NextRangeCount) then
+        begin
+          EAWStats[eaw].Handled := True;
+          Write(OutFile, 'if');
+          for RI := 0 to High(EAWStats[eaw].Ranges) do
+          begin
+            if RI <> 0 then
+              Writeln(OutFile, 'or');
+            with EAWStats[eaw].Ranges[RI] do
+            begin
+              if RangeLo = RangeHi then
+                Write(OutFile, '(Ch=', RangeLo, ')')
+              else
+                Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))');
+              MaybeCoalesceRanges(RangeLo, RangeHi);
+            end;
+          end;
+          Writeln(OutFile, 'then result:=',eaw,' else');
+        end;
+      end;
+  until NextRangeCount > RangeCountThreshold;
+
+  if NextRangeCount <> High(Integer) then
+  begin
+    //for eaw := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
+    //  if not EAWStats[eaw].Handled then
+    //    Writeln(eaw, ' ', EAWStats[eaw].MinValue, '..', EAWStats[eaw].MaxValue, ' ', EAWStats[eaw].Count, ' ', Length(EAWStats[eaw].Ranges), ' ', (EAWStats[eaw].MaxValue - EAWStats[eaw].MinValue + 7) div 8);
+    Writeln(OutFile, 'case Ch of');
+    for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
+    begin
+      if not EAWStats[eaw].Handled then
+      begin
+        EAWStats[eaw].Handled := True;
+        for RI := 0 to High(EAWStats[eaw].Ranges) do
+        begin
+          if RI <> 0 then
+            Writeln(OutFile, ',');
+          with EAWStats[eaw].Ranges[RI] do
+          begin
+            if RangeLo = RangeHi then
+              Write(OutFile, RangeLo)
+            else
+              Write(OutFile, RangeLo, '..', RangeHi);
+          end;
+        end;
+        Writeln(OutFile, ':result:=', eaw, ';');
+      end;
+    end;
+    Writeln(OutFile, 'else result:=eawN end');
+  end
+  else
+    Writeln(OutFile, 'result:=eawN');
+
+  CloseFile(OutFile);
+end;
+
+begin
+  ParseEastAsianWidths('data/UCD/EastAsianWidth.txt');
+  CalcStatsAndRanges;
+  GenCode('eastasianwidth_code.inc');
+  Writeln('Done');
+end.
+

+ 1 - 0
utils/unicode/fpmake.pp

@@ -61,6 +61,7 @@ begin
     T:=P.Targets.AddProgram('cldrparser.lpr');
     T:=P.Targets.AddProgram('unihelper.lpr');
     T:=P.Targets.AddProgram('gbpparser.lpr');
+    T:=P.Targets.AddProgram('eawparser.lpr');
 
     end;
 end;