2
0

gbpparser.lpr 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. { Parser and code generator for the GraphemeBreakProperty.
  2. Copyright (C) 2021 Nikolay Nikolov <[email protected]>
  3. This source is free software; you can redistribute it and/or modify it under
  4. the terms of the GNU General Public License as published by the Free
  5. Software Foundation; either version 2 of the License, or (at your option)
  6. any later version.
  7. This code is distributed in the hope that it will be useful, but WITHOUT ANY
  8. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  9. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  10. details.
  11. A copy of the GNU General Public License is available on the World Wide Web
  12. at <http://www.gnu.org/copyleft/gpl.html>. You can also obtain it by writing
  13. to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
  14. Boston, MA 02110-1335, USA.
  15. }
  16. program gbpparser;
  17. {$mode objfpc}{$H+}
  18. uses
  19. SysUtils, StrUtils;
  20. type
  21. TGraphemeBreakProperty = (
  22. gbpOther,
  23. gbpPrepend,
  24. gbpCR,
  25. gbpLF,
  26. gbpControl,
  27. gbpExtend,
  28. gpbRegional_Indicator,
  29. gbpSpacingMark,
  30. gbpL,
  31. gbpV,
  32. gbpT,
  33. gbpLV,
  34. gbpLVT,
  35. gbpE_Base,
  36. gbpE_Modifier,
  37. gbpZWJ,
  38. gbpGlue_After_Zwj,
  39. gbpE_Base_GAZ);
  40. TRange = record
  41. RangeLo, RangeHi: UCS4Char;
  42. end;
  43. TRanges = array of TRange;
  44. var
  45. GraphemeBreakProperties: array [UCS4Char] of TGraphemeBreakProperty;
  46. GBPStats: array [TGraphemeBreakProperty] of record
  47. Exists: Boolean;
  48. Handled: Boolean;
  49. MinValue: UCS4Char;
  50. MaxValue: UCS4Char;
  51. Count: LongInt;
  52. Ranges: TRanges;
  53. end;
  54. function ParseGraphemeBreakProperty(S: string): TGraphemeBreakProperty;
  55. begin
  56. S := Trim(S);
  57. case S of
  58. 'Prepend':
  59. Result := gbpPrepend;
  60. 'CR':
  61. Result := gbpCR;
  62. 'LF':
  63. Result := gbpLF;
  64. 'Control':
  65. Result := gbpControl;
  66. 'Extend':
  67. Result := gbpExtend;
  68. 'Regional_Indicator':
  69. Result := gpbRegional_Indicator;
  70. 'SpacingMark':
  71. Result := gbpSpacingMark;
  72. 'L':
  73. Result := gbpL;
  74. 'V':
  75. Result := gbpV;
  76. 'T':
  77. Result := gbpT;
  78. 'LV':
  79. Result := gbpLV;
  80. 'LVT':
  81. Result := gbpLVT;
  82. 'E_Base':
  83. Result := gbpE_Base;
  84. 'E_Modifier':
  85. Result := gbpE_Modifier;
  86. 'ZWJ':
  87. Result := gbpZWJ;
  88. 'Glue_After_Zwj':
  89. Result := gbpGlue_After_Zwj;
  90. 'E_Base_GAZ':
  91. Result := gbpE_Base_GAZ;
  92. else
  93. raise EArgumentException('Unknown grapheme break property: ''' + S + '''');
  94. end;
  95. end;
  96. procedure ParseRange(S: string; out RangeLo, RangeHi: UCS4Char);
  97. var
  98. dp: SizeInt;
  99. begin
  100. S := Trim(S);
  101. dp := Pos('..', S);
  102. if dp > 0 then
  103. begin
  104. RangeLo := StrToInt('$' + LeftStr(S, dp - 1));
  105. RangeHi := StrToInt('$' + Copy(S, dp + 2, Length(S) - dp + 3));
  106. end
  107. else
  108. begin
  109. RangeLo := StrToInt('$' + S);
  110. RangeHi := RangeLo;
  111. end;
  112. end;
  113. procedure ParseGraphemeBreakProperties(const FileName: string);
  114. var
  115. InF: TextFile;
  116. S: string;
  117. SplitS: TStringArray;
  118. LineNr: Integer = 0;
  119. gbp: TGraphemeBreakProperty;
  120. RangeLo, RangeHi, R: UCS4Char;
  121. begin
  122. if not FileExists(FileName) then
  123. begin
  124. Writeln('File doesn''t exist: ', FileName);
  125. Halt(1);
  126. end;
  127. AssignFile(InF, FileName);
  128. Reset(InF);
  129. while not EoF(InF) do
  130. begin
  131. Inc(LineNr);
  132. Readln(InF, S);
  133. S := Trim(S);
  134. if Pos('#', S) > 0 then
  135. S := LeftStr(S, Pos('#', S) - 1);
  136. if S <> '' then
  137. begin
  138. SplitS := S.Split([';']);
  139. if Length(SplitS) <> 2 then
  140. raise Exception.Create('Invalid number of ; separators on line ' + IntToStr(LineNr));
  141. ParseRange(SplitS[0], RangeLo, RangeHi);
  142. gbp := ParseGraphemeBreakProperty(SplitS[1]);
  143. for R := RangeLo to RangeHi do
  144. GraphemeBreakProperties[R] := gbp;
  145. end;
  146. end;
  147. CloseFile(InF);
  148. end;
  149. procedure CalcStatsAndRanges;
  150. var
  151. Ch: UCS4Char;
  152. gbp, prev_gbp: TGraphemeBreakProperty;
  153. begin
  154. FillChar(GBPStats, SizeOf(GBPStats), 0);
  155. gbp := Low(TGraphemeBreakProperty);
  156. for Ch := Low(UCS4Char) to High(UCS4Char) do
  157. begin
  158. prev_gbp := gbp;
  159. gbp := GraphemeBreakProperties[Ch];
  160. with GBPStats[gbp] do
  161. begin
  162. if not Exists then
  163. begin
  164. Exists := True;
  165. MinValue := Ch;
  166. MaxValue := Ch;
  167. Count := 1;
  168. SetLength(Ranges, 1);
  169. Ranges[0].RangeLo := Ch;
  170. Ranges[0].RangeHi := Ch;
  171. end
  172. else
  173. begin
  174. MaxValue := Ch;
  175. Inc(Count);
  176. if prev_gbp <> gbp then
  177. begin
  178. SetLength(Ranges, Length(Ranges) + 1);
  179. with Ranges[High(Ranges)] do
  180. begin
  181. RangeLo := Ch;
  182. RangeHi := Ch;
  183. end;
  184. end
  185. else
  186. Ranges[High(Ranges)].RangeHi := Ch;
  187. end;
  188. end;
  189. end;
  190. end;
  191. procedure MaybeCoalesceRanges(RLo, RHi: UCS4Char);
  192. var
  193. gbp: TGraphemeBreakProperty;
  194. RI: Integer;
  195. begin
  196. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  197. if GBPStats[gbp].Exists and (not GBPStats[gbp].Handled) then
  198. begin
  199. for RI := 0 to High(GBPStats[gbp].Ranges) - 1 do
  200. if (GBPStats[gbp].Ranges[RI].RangeHi = (RLo - 1)) and
  201. (GBPStats[gbp].Ranges[RI + 1].RangeLo = (RHi + 1)) then
  202. begin
  203. GBPStats[gbp].Ranges[RI].RangeHi := GBPStats[gbp].Ranges[RI + 1].RangeHi;
  204. Delete(GBPStats[gbp].Ranges, RI + 1, 1);
  205. exit;
  206. end;
  207. end;
  208. end;
  209. function FindMinRangeCount: Integer;
  210. var
  211. gbp: TGraphemeBreakProperty;
  212. begin
  213. Result := High(Integer);
  214. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  215. if GBPStats[gbp].Exists and (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) < Result) then
  216. Result := Length(GBPStats[gbp].Ranges);
  217. end;
  218. function ApplyLV_LVTCompression: Boolean;
  219. const
  220. RangeLo = 44032;
  221. RangeHi = 55203;
  222. var
  223. Ch: UCS4Char;
  224. begin
  225. Result := False;
  226. if (GBPStats[gbpLV].MinValue <> RangeLo) or (GBPStats[gbpLV].MaxValue <> (RangeHi - 27)) or
  227. (GBPStats[gbpLVT].MinValue <> (RangeLo + 1)) or (GBPStats[gbpLVT].MaxValue <> RangeHi) then
  228. exit;
  229. for Ch := RangeLo to RangeHi do
  230. begin
  231. if ((Ch - RangeLo) mod 28) = 0 then
  232. begin
  233. if GraphemeBreakProperties[Ch] <> gbpLV then
  234. exit;
  235. end
  236. else
  237. begin
  238. if GraphemeBreakProperties[Ch] <> gbpLVT then
  239. exit;
  240. end;
  241. end;
  242. Result := True;
  243. end;
  244. procedure GenCode(const OutFileName: string);
  245. const
  246. RangeCountThreshold = 30{400};
  247. var
  248. gbp: TGraphemeBreakProperty;
  249. RI, NextRangeCount: Integer;
  250. OutFile: TextFile;
  251. begin
  252. Writeln('Generating file: ', OutFileName);
  253. AssignFile(OutFile, OutFileName);
  254. Rewrite(OutFile);
  255. Writeln(OutFile, '{ do not edit, this file is autogenerated by the gbpparser tool }');
  256. { unused properties are already handled }
  257. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  258. if not GBPStats[gbp].Exists then
  259. GBPStats[gbp].Handled := True;
  260. { handle single codepoints first }
  261. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  262. if (not GBPStats[gbp].Handled) and (GBPStats[gbp].Count = 1) then
  263. begin
  264. if GBPStats[gbp].MinValue <> GBPStats[gbp].MaxValue then
  265. raise Exception.Create('Internal error');
  266. Writeln(OutFile, 'if Ch=', GBPStats[gbp].MinValue, 'then result:=',gbp,' else');
  267. GBPStats[gbp].Handled := True;
  268. MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
  269. end;
  270. { handle single range codepoints next }
  271. while FindMinRangeCount = 1 do
  272. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  273. if (not GBPStats[gbp].Handled) and (Length(GBPStats[gbp].Ranges) = 1) then
  274. begin
  275. Writeln(OutFile, 'if(Ch>=', GBPStats[gbp].MinValue, ')and(Ch<=', GBPStats[gbp].MaxValue, ')then result:=',gbp,' else');
  276. GBPStats[gbp].Handled := True;
  277. MaybeCoalesceRanges(GBPStats[gbp].MinValue, GBPStats[gbp].MaxValue);
  278. end;
  279. if ApplyLV_LVTCompression then
  280. begin
  281. Writeln(OutFile, 'if(Ch>=44032)and(Ch<=55203)then begin if((Ch-44032)mod 28)=0then result:=gbpLV else result:=gbpLVT end else');
  282. GBPStats[gbpLV].Handled := True;
  283. GBPStats[gbpLVT].Handled := True;
  284. end;
  285. repeat
  286. NextRangeCount := FindMinRangeCount;
  287. if NextRangeCount <= RangeCountThreshold then
  288. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  289. begin
  290. if not GBPStats[gbp].Handled and (Length(GBPStats[gbp].Ranges) <= NextRangeCount) then
  291. begin
  292. GBPStats[gbp].Handled := True;
  293. Write(OutFile, 'if');
  294. for RI := 0 to High(GBPStats[gbp].Ranges) do
  295. begin
  296. if RI <> 0 then
  297. Writeln(OutFile, 'or');
  298. with GBPStats[gbp].Ranges[RI] do
  299. begin
  300. if RangeLo = RangeHi then
  301. Write(OutFile, '(Ch=', RangeLo, ')')
  302. else
  303. Write(OutFile, '((Ch>=', RangeLo, ')and(Ch<=', RangeHi, '))');
  304. MaybeCoalesceRanges(RangeLo, RangeHi);
  305. end;
  306. end;
  307. Writeln(OutFile, 'then result:=',gbp,' else');
  308. end;
  309. end;
  310. until NextRangeCount > RangeCountThreshold;
  311. if NextRangeCount <> High(Integer) then
  312. begin
  313. //for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  314. // if not GBPStats[gbp].Handled then
  315. // Writeln(gbp, ' ', GBPStats[gbp].MinValue, '..', GBPStats[gbp].MaxValue, ' ', GBPStats[gbp].Count, ' ', Length(GBPStats[gbp].Ranges), ' ', (GBPStats[gbp].MaxValue - GBPStats[gbp].MinValue + 7) div 8);
  316. Writeln(OutFile, 'case Ch of');
  317. for gbp := Succ(Low(TGraphemeBreakProperty)) to High(TGraphemeBreakProperty) do
  318. begin
  319. if not GBPStats[gbp].Handled then
  320. begin
  321. GBPStats[gbp].Handled := True;
  322. for RI := 0 to High(GBPStats[gbp].Ranges) do
  323. begin
  324. if RI <> 0 then
  325. Writeln(OutFile, ',');
  326. with GBPStats[gbp].Ranges[RI] do
  327. begin
  328. if RangeLo = RangeHi then
  329. Write(OutFile, RangeLo)
  330. else
  331. Write(OutFile, RangeLo, '..', RangeHi);
  332. end;
  333. end;
  334. Writeln(OutFile, ':result:=', gbp, ';');
  335. end;
  336. end;
  337. Writeln(OutFile, 'else result:=gbpOther end');
  338. end
  339. else
  340. Writeln(OutFile, 'result:=gbpOther');
  341. CloseFile(OutFile);
  342. end;
  343. begin
  344. FillChar(GraphemeBreakProperties, SizeOf(GraphemeBreakProperties), 0);
  345. ParseGraphemeBreakProperties('data/UCD/auxiliary/GraphemeBreakProperty.txt');
  346. CalcStatsAndRanges;
  347. GenCode('graphemebreakproperty_code.inc');
  348. Writeln('Done');
  349. end.