eawparser.lpr 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. { Parser and code generator for the EastAsianWidth.
  2. Copyright (C) 2021 Nikolay Nikolov <[email protected]>
  3. This source is free software; you can redistribute it and/or modify it under
  4. the terms of the GNU General Public License as published by the Free
  5. Software Foundation; either version 2 of the License, or (at your option)
  6. any later version.
  7. This code is distributed in the hope that it will be useful, but WITHOUT ANY
  8. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  9. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  10. details.
  11. A copy of the GNU General Public License is available on the World Wide Web
  12. at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
  13. to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
  14. Boston, MA 02110-1301, USA.
  15. }
  16. program eawparser;
  17. {$mode objfpc}{$H+}
  18. uses
  19. SysUtils, StrUtils;
  type
    { East Asian Width classes.  Each member corresponds to the property value
      of the same spelling ('N', 'A', 'F', 'H', 'Na', 'W') accepted by
      ParseEastAsianWidth.  eawN has to stay the first member: it is the
      default class, and the code generator iterates from
      Succ(Low(TEastAsianWidth)) so that it is skipped. }
    TEastAsianWidth = (eawN, eawA, eawF, eawH, eawNa, eawW);

    { A run of code points; both bounds are inclusive. }
    TRange = record
      RangeLo: UCS4Char;
      RangeHi: UCS4Char;
    end;

    { Growable list of ranges. }
    TRanges = array of TRange;
  type
    { Bookkeeping kept for one East Asian Width class. }
    TEastAsianWidthStats = record
      Exists: Boolean;     { at least one code point has this class }
      Handled: Boolean;    { the code generator is done with this class }
      MinValue: UCS4Char;  { first code point seen with this class }
      MaxValue: UCS4Char;  { last code point seen with this class }
      Count: LongInt;      { number of code points with this class }
      Ranges: TRanges;     { runs of consecutive code points with this class }
    end;

  var
    { Width class of every code point, indexed by the code point itself. }
    EastAsianWidths: array [UCS4Char] of TEastAsianWidth;
    { Per-class statistics, filled in by CalcStatsAndRanges. }
    EAWStats: array [TEastAsianWidth] of TEastAsianWidthStats;
  { Converts the property-value field of an EastAsianWidth.txt line into the
    matching TEastAsianWidth member.

    S      - raw field text; leading and trailing whitespace is ignored.
    Result - the member for 'N', 'A', 'F', 'H', 'Na' or 'W' (the comparison is
             case sensitive).
    Raises EArgumentException when S is none of the known values. }
  function ParseEastAsianWidth(S: string): TEastAsianWidth;
  begin
    S := Trim(S);
    case S of
      'N':
        Result := eawN;
      'A':
        Result := eawA;
      'F':
        Result := eawF;
      'H':
        Result := eawH;
      'Na':
        Result := eawNa;
      'W':
        Result := eawW;
    else
      { The exception object must be built with .Create.  Writing
        EArgumentException(<string>) is only a typecast of the string to the
        class type and would raise an invalid object. }
      raise EArgumentException.Create('Unknown east asian width: ''' + S + '''');
    end;
  end;
  { Parses the code point field of a data line.

    S       - either a single hexadecimal code point ('1F600') or an inclusive
              range written as 'XXXX..YYYY'.  Whitespace around the field and
              around each bound is ignored.
    RangeLo - receives the first code point.
    RangeHi - receives the last code point (equal to RangeLo for a single
              code point).
    Raises EConvertError when a bound is not valid hexadecimal, when a bound
    lies outside Low(UCS4Char)..High(UCS4Char), or when the range is reversed. }
  procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char);
  var
    dp: SizeInt;
    Lo, Hi: LongInt;
  begin
    S := Trim(S);
    dp := Pos('..', S);
    if dp > 0 then
    begin
      Lo := StrToInt('$' + Trim(LeftStr(S, dp - 1)));
      { Everything after the two dots: Length(S) - (dp + 1) characters. }
      Hi := StrToInt('$' + Trim(Copy(S, dp + 2, Length(S) - dp - 1)));
    end
    else
    begin
      Lo := StrToInt('$' + S);
      Hi := Lo;
    end;
    { Validate before narrowing to UCS4Char: callers use the bounds as array
      indexes, so an out-of-range value must not slip through silently. }
    if (Lo < Low(UCS4Char)) or (Hi > High(UCS4Char)) or (Lo > Hi) then
      raise EConvertError.Create('Invalid code point range: ''' + S + '''');
    RangeLo := Lo;
    RangeHi := Hi;
  end;
  { Fills the global EastAsianWidths table from an EastAsianWidth.txt file.

    The table is first seeded with the default values, then every data line of
    the form '<code point or range>;<width>' overrides its range.  Anything
    from a '#' to the end of a line is a comment; blank lines are skipped.

    FileName - path of the data file.  If it does not exist, a message is
               printed and the program halts with exit code 1.
    Raises Exception when a data line does not have exactly two ';'-separated
    fields; errors from ParseRange and ParseEastAsianWidth propagate unchanged.
    The input file is closed even when such an error is raised. }
  procedure ParseEastAsianWidths(const FileName: string);

    { Assigns Value to every code point in Lo..Hi (inclusive). }
    procedure SetRange(Lo, Hi: UCS4Char; Value: TEastAsianWidth);
    var
      Ch: UCS4Char;
    begin
      for Ch := Lo to Hi do
        EastAsianWidths[Ch] := Value;
    end;

  var
    InF: TextFile;
    S: string;
    SplitS: TStringArray;
    LineNr: Integer = 0;
    eaw: TEastAsianWidth;
    RangeLo, RangeHi: UCS4Char;
    HashPos: SizeInt;
  begin
    { - All code points, assigned or unassigned, that are not listed
        explicitly are given the value "N". }
    SetRange(Low(UCS4Char), High(UCS4Char), eawN);
    { - The unassigned code points in the following blocks default to "W":
           CJK Unified Ideographs Extension A: U+3400..U+4DBF
           CJK Unified Ideographs:             U+4E00..U+9FFF
           CJK Compatibility Ideographs:       U+F900..U+FAFF }
    SetRange($3400, $4DBF, eawW);
    SetRange($4E00, $9FFF, eawW);
    SetRange($F900, $FAFF, eawW);
    { - All undesignated code points in Planes 2 and 3, whether inside or
        outside of allocated blocks, default to "W":
           Plane 2: U+20000..U+2FFFD
           Plane 3: U+30000..U+3FFFD }
    SetRange($20000, $2FFFD, eawW);
    SetRange($30000, $3FFFD, eawW);

    if not FileExists(FileName) then
    begin
      Writeln('File doesn''t exist: ', FileName);
      Halt(1);
    end;

    AssignFile(InF, FileName);
    Reset(InF);
    try
      while not EoF(InF) do
      begin
        Inc(LineNr);
        Readln(InF, S);
        S := Trim(S);
        { Drop a trailing comment, if any. }
        HashPos := Pos('#', S);
        if HashPos > 0 then
          S := LeftStr(S, HashPos - 1);
        if S <> '' then
        begin
          SplitS := S.Split([';']);
          if Length(SplitS) <> 2 then
            raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr));
          ParseRange(SplitS[0], RangeLo, RangeHi);
          eaw := ParseEastAsianWidth(SplitS[1]);
          SetRange(RangeLo, RangeHi, eaw);
        end;
      end;
    finally
      { Release the file handle on both the normal and the error path. }
      CloseFile(InF);
    end;
  end;
  { Scans the whole EastAsianWidths table once, in ascending code point order,
    and rebuilds EAWStats: for every class it records whether it occurs, its
    first and last code point, the number of code points, and the list of
    runs of consecutive code points that share the class. }
  procedure CalcStatsAndRanges;
  var
    Ch: UCS4Char;
    eaw, prev_eaw: TEastAsianWidth;
  begin
    { Reset field by field.  The records hold a dynamic array (Ranges), so
      wiping them with FillChar would bypass reference counting and leak the
      arrays if this procedure were ever run a second time. }
    for eaw := Low(TEastAsianWidth) to High(TEastAsianWidth) do
      with EAWStats[eaw] do
      begin
        Exists := False;
        Handled := False;
        MinValue := 0;
        MaxValue := 0;
        Count := 0;
        Ranges := nil;
      end;

    eaw := Low(TEastAsianWidth);
    for Ch := Low(UCS4Char) to High(UCS4Char) do
    begin
      prev_eaw := eaw;
      eaw := EastAsianWidths[Ch];
      with EAWStats[eaw] do
      begin
        if not Exists then
        begin
          { First code point of this class: start its first run. }
          Exists := True;
          MinValue := Ch;
          MaxValue := Ch;
          Count := 1;
          SetLength(Ranges, 1);
          Ranges[0].RangeLo := Ch;
          Ranges[0].RangeHi := Ch;
        end
        else
        begin
          MaxValue := Ch;
          Inc(Count);
          if prev_eaw <> eaw then
          begin
            { The class reappears after a gap: open a new run. }
            SetLength(Ranges, Length(Ranges) + 1);
            with Ranges[High(Ranges)] do
            begin
              RangeLo := Ch;
              RangeHi := Ch;
            end;
          end
          else
            { Same class as the previous code point: extend the current run. }
            Ranges[High(Ranges)].RangeHi := Ch;
        end;
      end;
    end;
  end;
  { Called once the code points RLo..RHi have been dealt with by the
    generator.  Looks through every class that exists and is not yet handled
    (eawN is skipped) for two neighbouring runs that are separated by exactly
    RLo..RHi.  The first such pair found is merged into one run and the
    procedure returns. }
  procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char);
  var
    Cls: TEastAsianWidth;
    Idx: Integer;
  begin
    for Cls := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
    begin
      if EAWStats[Cls].Handled or (not EAWStats[Cls].Exists) then
        Continue;

      { Idx addresses the right-hand run of each neighbouring pair. }
      for Idx := 1 to High(EAWStats[Cls].Ranges) do
      begin
        if EAWStats[Cls].Ranges[Idx - 1].RangeHi <> (RLo - 1) then
          Continue;
        if EAWStats[Cls].Ranges[Idx].RangeLo <> (RHi + 1) then
          Continue;

        { Extend the left run over the gap and the right run, then drop the
          right run. }
        EAWStats[Cls].Ranges[Idx - 1].RangeHi := EAWStats[Cls].Ranges[Idx].RangeHi;
        Delete(EAWStats[Cls].Ranges, Idx, 1);
        Exit;
      end;
    end;
  end;
  { Returns the smallest number of runs among the classes that exist and are
    not yet handled (eawN is skipped).  High(Integer) is returned when there
    is no such class. }
  function FindMinRangeCount: Integer;
  var
    Cls: TEastAsianWidth;
    RunCount: Integer;
  begin
    Result := High(Integer);
    for Cls := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
    begin
      if not EAWStats[Cls].Exists then
        Continue;
      if EAWStats[Cls].Handled then
        Continue;
      RunCount := Length(EAWStats[Cls].Ranges);
      if RunCount < Result then
        Result := RunCount;
    end;
  end;
  { Writes the generated Pascal fragment that maps a code point 'Ch' to its
    East Asian Width class ('result').

    Classes are emitted cheapest first: classes consisting of a single code
    point, then classes consisting of a single run, then classes with at most
    RangeCountThreshold runs as chained 'if ... else' tests.  Whatever is left
    goes into one 'case Ch of' statement.  eawN is never tested explicitly; it
    is the final 'else' result.  After a class has been emitted,
    MaybeCoalesceRanges is given the chance to merge runs of the remaining
    classes across the code points that are now covered.

    OutFileName - path of the include file to (re)create.
    Raises Exception('Internal error') if the statistics of a single-code-point
    class are inconsistent.  The output file is closed on every path. }
  procedure GenCode(const OutFileName: string);
  const
    RangeCountThreshold = 30{400};
  var
    eaw: TEastAsianWidth;
    RI, NextRangeCount: Integer;
    OutFile: TextFile;
  begin
    Writeln('Generating file: ', OutFileName);
    AssignFile(OutFile, OutFileName);
    Rewrite(OutFile);
    try
      Writeln(OutFile, '{ do not edit, this file is autogenerated by the eawparser tool }');

      { unused properties are already handled }
      for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
        if not EAWStats[eaw].Exists then
          EAWStats[eaw].Handled := True;

      { handle single codepoints first }
      for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
        if (not EAWStats[eaw].Handled) and (EAWStats[eaw].Count = 1) then
        begin
          if EAWStats[eaw].MinValue <> EAWStats[eaw].MaxValue then
            raise Exception.Create('Internal error');
          Writeln(OutFile, 'if Ch=', EAWStats[eaw].MinValue, 'then result:=',eaw,' else');
          EAWStats[eaw].Handled := True;
          MaybeCoalesceRanges(EAWStats[eaw].MinValue, EAWStats[eaw].MaxValue);
        end;

      { handle single range codepoints next; coalescing can turn further
        classes into single-range ones, hence the outer loop }
      while FindMinRangeCount = 1 do
        for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
          if (not EAWStats[eaw].Handled) and (Length(EAWStats[eaw].Ranges) = 1) then
          begin
            Writeln(OutFile, 'if(Ch>=', EAWStats[eaw].MinValue, ')and(Ch<=', EAWStats[eaw].MaxValue, ')then result:=',eaw,' else');
            EAWStats[eaw].Handled := True;
            MaybeCoalesceRanges(EAWStats[eaw].MinValue, EAWStats[eaw].MaxValue);
          end;

      { classes with few ranges become one 'if' with or-ed range tests }
      repeat
        NextRangeCount := FindMinRangeCount;
        if NextRangeCount <= RangeCountThreshold then
          for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
          begin
            if not EAWStats[eaw].Handled and (Length(EAWStats[eaw].Ranges) <= NextRangeCount) then
            begin
              { Mark as handled before coalescing so that this class's own
                range list is not modified while it is being walked. }
              EAWStats[eaw].Handled := True;
              Write(OutFile, 'if');
              for RI := 0 to High(EAWStats[eaw].Ranges) do
              begin
                if RI <> 0 then
                  Writeln(OutFile, 'or');
                with EAWStats[eaw].Ranges[RI] do
                begin
                  if RangeLo = RangeHi then
                    Write(OutFile, '(Ch=', RangeLo, ')')
                  else
                    Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))');
                  MaybeCoalesceRanges(RangeLo, RangeHi);
                end;
              end;
              Writeln(OutFile, 'then result:=',eaw,' else');
            end;
          end;
      until NextRangeCount > RangeCountThreshold;

      { NextRangeCount = High(Integer) means every class has been handled }
      if NextRangeCount <> High(Integer) then
      begin
        //for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
        //  if not EAWStats[eaw].Handled then
        //    Writeln(eaw, ' ', EAWStats[eaw].MinValue, '..', EAWStats[eaw].MaxValue, ' ', EAWStats[eaw].Count, ' ', Length(EAWStats[eaw].Ranges), ' ', (EAWStats[eaw].MaxValue - EAWStats[eaw].MinValue + 7) div 8);
        Writeln(OutFile, 'case Ch of');
        for eaw := Succ(Low(TEastAsianWidth)) to High(TEastAsianWidth) do
        begin
          if not EAWStats[eaw].Handled then
          begin
            EAWStats[eaw].Handled := True;
            for RI := 0 to High(EAWStats[eaw].Ranges) do
            begin
              if RI <> 0 then
                Writeln(OutFile, ',');
              with EAWStats[eaw].Ranges[RI] do
              begin
                if RangeLo = RangeHi then
                  Write(OutFile, RangeLo)
                else
                  Write(OutFile, RangeLo, '..', RangeHi);
              end;
            end;
            Writeln(OutFile, ':result:=', eaw, ';');
          end;
        end;
        Writeln(OutFile, 'else result:=eawN end');
      end
      else
        Writeln(OutFile, 'result:=eawN');
    finally
      { Flush and close the output even if generation failed half-way. }
      CloseFile(OutFile);
    end;
  end;
  const
    { Unicode Character Database input, relative to the working directory. }
    UCDInputFile = 'data/UCD/EastAsianWidth.txt';
    { Include file produced by the tool. }
    GeneratedIncFile = 'eastasianwidth_code.inc';

  { Program entry point: load the data file, compute the per-class ranges,
    then write the generated lookup code. }
  begin
    ParseEastAsianWidths(UCDInputFile);
    CalcStatsAndRanges;
    GenCode(GeneratedIncFile);
    Writeln('Done');
  end.