unihelper.lpr 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. { Unicode tables parser.
  2. Copyright (c) 2012 by Inoussa OUEDRAOGO
  3. The source code is distributed under the Library GNU
  4. General Public License with the following modification:
  5. - object files and libraries linked into an application may be
  6. distributed without source code.
  7. If you didn't receive a copy of the file COPYING, contact:
  8. Free Software Foundation
  9. 675 Mass Ave
  10. Cambridge, MA 02139
  11. USA
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
  15. { This program generates tables as include-files for use
  16. with the unicode related sources. It expects the following
  17. unicode.org's files to be present in the same folder :
  18. * HangulSyllableType.txt
  19. * PropList.txt
  20. * UnicodeData.txt
  21. * allkeys.txt
  22. }
  23. {$DEFINE UCA_TEST}
  24. program unihelper;
  25. {$mode objfpc}{$H+}
  26. {$typedaddress on}
  27. uses
  28. SysUtils, Classes,
  29. helper, uca_test;
  const
    { Help text written to standard output every time the program starts.
      The usage line and the outputDir default describe what the main program
      actually does with its arguments: dataDir may be given alone, and when
      outputDir is omitted the generated files go to the data directory. }
    SUsage =
      'This program generates tables as include-files for use ' + sLineBreak +
      ' with the unicode related sources. It expects the following ' + sLineBreak +
      ' unicode.org''s files to be present in the same folder : ' + sLineBreak +
      ' * HangulSyllableType.txt ' + sLineBreak +
      ' * PropList.txt ' + sLineBreak +
      ' * UnicodeData.txt ' + sLineBreak +
      ' * allkeys.txt : Note that this file is the one provided for the CLDR root.' + sLineBreak +
      '' + sLineBreak +
      'Usage : unihelper [<dataDir> [<outputDir>]] ' + sLineBreak +
      ' where ' + sLineBreak +
      ' dataDir : the directory where are stored the unicode files. The default' + sLineBreak +
      ' value is the program''s directory.' + sLineBreak +
      ' outputDir : The directory where the generated files will be stored. The' + sLineBreak +
      ' default value is dataDir.'+sLineBreak;
  { Formats a code point record for display.
    A record whose LineType is 0 is rendered as its single code point in
    hexadecimal (at least 4 digits); any other record is rendered as
    "<start>..<end>" using the same hexadecimal format. }
  function DumpCodePoint(ACodePoint : TCodePointRec) : string;
  begin
    case ACodePoint.LineType of
      0 :
        Result := IntToHex(ACodePoint.CodePoint,4);
      else
        Result := IntToHex(ACodePoint.StartCodePoint,4) + '..' +
                  IntToHex(ACodePoint.EndCodePoint,4);
    end;
  end;
  var
    dataPath, outputPath : string;
    stream, binStreamNE, binStreamOE, tmpStream : TMemoryStream;
    binaryStreamNE, binaryStreamOE : TMemoryStream;
    hangulSyllables : TCodePointRecArray;
    ucaBook : TUCA_DataBook;
    ucaPropBook : PUCA_PropBook;
    propList : TPropListLineRecArray;
    whiteSpaceCodePoints : TCodePointRecArray;
    props : TPropRecArray;
    numericTable : TNumericValueArray;
    decomposition : TDecompositionArray;
    decompositionBook : TDecompositionBook;
    data : TDataLineRecArray;
    //----------------
    lvl3table1 : T3lvlBmp1Table;
    lvl3table2 : T3lvlBmp2Table;
    lvl3table3 : T3lvlBmp3Table;
    //----------------
    s : ansistring;
    i, k, h : Integer;
    p : PDataLineRec;
    r : TDataLineRecArray;
    olvl3table1 : T3lvlOBmp1Table;
    olvl3table2 : T3lvlOBmp2Table;
    olvl3table3 : T3lvlOBmp3Table;
    //----------------
    hs, ls : Word;
    ucaFirstTable : TucaBmpFirstTable;
    ucaSecondTable : TucaBmpSecondTable;
    ucaoFirstTable : TucaoBmpFirstTable;
    ucaoSecondTable : TucaOBmpSecondTable;
    WL : Integer;
    serializedHeader : TSerializedCollationHeader;

  { Main program.
    Steps, in order:
      1. print the usage text and resolve the data / output directories;
      2. load and parse HangulSyllableType.txt, PropList.txt, UnicodeData.txt
         and allkeys.txt from the data directory;
      3. build the UCA tables and write ucadata.inc, its two endian-specific
         companions and the two binary collation_ducet_*.bco files;
      4. build the 3-level BMP / OBMP property tables, check them against
         GetPropID, and write unicodedata.inc (plus endian-specific
         companions) and unicodenumtable.pas;
      5. write three diagnostic files (diff-obmp.txt, diff.txt, diff2.txt)
         listing every property ID mismatch found by cross-checks.
    The program halts with exit code 1 when the output directory cannot be
    created or when one of the input files is missing. }
  begin
    WriteLn(SUsage+sLineBreak);

    // First argument: data directory (default: the program's directory).
    if (ParamCount > 0) then
      dataPath := IncludeTrailingPathDelimiter(ParamStr(1))
    else
      dataPath := ExtractFilePath(ParamStr(0));
    // Second argument: output directory (default: the data directory).
    if (ParamCount > 1) then
      outputPath := IncludeTrailingPathDelimiter(ParamStr(2))
    else
      outputPath := dataPath;

    if not DirectoryExists(outputPath) then begin
      WriteLn('Directory not found : ',outputPath);
      if ForceDirectories(outputPath) then begin
        WriteLn(' directory created successfully');
      end else begin
        WriteLn(' fail to create directory.');
        Halt(1);
      end;
    end;

    if not(
         FileExists(dataPath + 'HangulSyllableType.txt') and
         FileExists(dataPath + 'PropList.txt') and
         FileExists(dataPath + 'UnicodeData.txt') and
         FileExists(dataPath + 'allkeys.txt')
       )
    then begin
      WriteLn('File(s) not found : HangulSyllableType.txt or PropList.txt or UnicodeData.txt or allkeys.txt .');
      Halt(1);
    end;

    // Every stream except "stream" starts as nil so that the finally section
    // can call Free() safely whatever point the try block was left at.
    binaryStreamNE := nil;
    binaryStreamOE := nil;
    binStreamOE := nil;
    binStreamNE := nil;
    tmpStream := nil;
    stream := TMemoryStream.Create();
    try
      binStreamNE := TMemoryStream.Create();
      binStreamOE := TMemoryStream.Create();
      tmpStream := TMemoryStream.Create();

      WriteLn('Load file HangulSyllableType.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'HangulSyllableType.txt');
      stream.Position := 0;
      hangulSyllables := nil;
      ParseHangulSyllableTypes(stream,hangulSyllables);
      stream.Clear();

      WriteLn('Load file PropList.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'PropList.txt');
      stream.Position := 0;
      propList := nil;
      ParseProps(stream,propList);
      stream.Clear();
      whiteSpaceCodePoints := FindCodePointsByProperty('White_Space',propList);
      writeln(' PropList Length = ',Length(propList));
      writeln(' White_Space Length = ',Length(whiteSpaceCodePoints));
      for i := Low(whiteSpaceCodePoints) to High(whiteSpaceCodePoints) do
        WriteLn(' ',DumpCodePoint(whiteSpaceCodePoints[i]):12,' , IsWhiteSpace = ',IsWhiteSpace(whiteSpaceCodePoints[i].CodePoint,whiteSpaceCodePoints));

      WriteLn('Load file UnicodeData.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'UnicodeData.txt');
      stream.Position := 0;
      WriteLn('Parse file ...', DateTimeToStr(Now));
      data := nil;
      props := nil;
      Parse_UnicodeData(stream,props,numericTable,data,decomposition,hangulSyllables,whiteSpaceCodePoints);
      WriteLn('Decomposition building ...');
      MakeDecomposition(decomposition,decompositionBook);

      WriteLn('Load file UCA allkeys.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'allkeys.txt');
      stream.Position := 0;
      ParseUCAFile(stream,ucaBook);

      // NOTE: the blank between "{" and "$" makes the next marker (and its
      // closing counterpart) a plain comment, not a compiler directive, so
      // the statements in between are always compiled.
      { $IFDEF UCA_TEST}
      // Mark each UCA line as stored or not: a line is NOT stored when its
      // first code point has a property record with a decomposition. k counts
      // those lines; WL is the longest weight list among the stored lines.
      k := 0;
      WL := 0;
      for i := 0 to Length(ucaBook.Lines) - 1 do begin
        h := GetPropID(ucaBook.Lines[i].CodePoints[0],data);
        if (h <> -1) and
           ({props[h].HangulSyllable or} (props[h].DecompositionID <> -1))
        then begin
          Inc(k);
          ucaBook.Lines[i].Stored := False;
        end else begin
          ucaBook.Lines[i].Stored := True;
          if Length(ucaBook.Lines[i].Weights) > WL then
            WL := Length(ucaBook.Lines[i].Weights);
        end;
      end;
      WriteLn(
        'UCA, Version = ',ucaBook.Version,'; entries count = ',Length(ucaBook.Lines),' ; Hangul # = ',k,
        ' ; Max Weights Length = ',WL
      );
      { $ENDIF UCA_TEST}

      WriteLn('Construct UCA Property Book ...');
      ucaPropBook := nil;
      MakeUCA_Props(@ucaBook,ucaPropBook);
      {$IFDEF UCA_TEST}
      uca_CheckProp_1(ucaBook,ucaPropBook);
      uca_CheckProp_x(ucaBook,ucaPropBook);
      {$ENDIF UCA_TEST}

      WriteLn('Construct UCA BMP tables ...');
      MakeUCA_BmpTables(ucaFirstTable,ucaSecondTable,ucaPropBook);
      WriteLn(' UCA BMP Second Table Length = ',Length(ucaSecondTable));
      {$IFDEF UCA_TEST}
      uca_CheckProp_1y(ucaBook,ucaPropBook,@ucaFirstTable,@ucaSecondTable);
      {$ENDIF UCA_TEST}

      WriteLn('Construct UCA OBMP tables ...');
      MakeUCA_OBmpTables(ucaoFirstTable,ucaoSecondTable,ucaPropBook);
      WriteLn(' UCA OBMP Second Table Length = ',Length(ucaoSecondTable));
      {$IFDEF UCA_TEST}
      uca_CheckProp_2y(ucaBook,ucaPropBook,@ucaoFirstTable,@ucaoSecondTable);
      {$ENDIF UCA_TEST}

      binaryStreamNE := TMemoryStream.Create();
      binaryStreamOE := TMemoryStream.Create();

      // ucadata.inc and its native / non-native endian companion files.
      WriteLn('Generate UCA Props tables ...');
      binStreamNE.Clear();
      binStreamOE.Clear();
      GenerateLicenceText(binStreamNE);
      GenerateLicenceText(binStreamOE);
      GenerateUCA_PropTable(binStreamNE,ucaPropBook,ENDIAN_NATIVE);
      GenerateUCA_PropTable(binStreamOE,ucaPropBook,ENDIAN_NON_NATIVE);
      WriteLn('Generate UCA BMP tables ...');
      stream.Clear();
      GenerateLicenceText(stream);
      GenerateUCA_Head(stream,@ucaBook,ucaPropBook);
      GenerateUCA_BmpTables(stream,binStreamNE,binStreamOE,ucaFirstTable,ucaSecondTable);
      WriteLn('Generate UCA OBMP tables ...');
      GenerateUCA_OBmpTables(stream,binStreamNE,binStreamOE,ucaoFirstTable,ucaoSecondTable);
      stream.SaveToFile(outputPath + 'ucadata.inc');
      s := outputPath + 'ucadata.inc';
      binStreamNE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NATIVE));
      binStreamOE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NON_NATIVE));
      binStreamNE.Clear();
      binStreamOE.Clear();

      // Binary DUCET
      FillChar(serializedHeader,SizeOf(TSerializedCollationHeader),0);
      serializedHeader.Version := ucaBook.Version;
      serializedHeader.CollationName := 'DUCET';//'Default Unicode Collation Element Table (DUCET)';
      serializedHeader.VariableWeight := Ord(ucaBook.VariableWeight);
      SetBit(serializedHeader.Backwards,0,ucaBook.Backwards[0]);
      SetBit(serializedHeader.Backwards,1,ucaBook.Backwards[1]);
      SetBit(serializedHeader.Backwards,2,ucaBook.Backwards[2]);
      SetBit(serializedHeader.Backwards,3,ucaBook.Backwards[3]);
      serializedHeader.BMP_Table1Length := Length(ucaFirstTable);
      serializedHeader.BMP_Table2Length := Length(TucaBmpSecondTableItem) *
                                           (Length(ucaSecondTable) * SizeOf(UInt24));
      serializedHeader.OBMP_Table1Length := Length(ucaoFirstTable) * SizeOf(Word);
      serializedHeader.OBMP_Table2Length := Length(TucaOBmpSecondTableItem) *
                                            (Length(ucaoSecondTable) * SizeOf(UInt24));
      serializedHeader.PropCount := ucaPropBook^.ItemSize;
      serializedHeader.VariableLowLimit := ucaPropBook^.VariableLowLimit;
      serializedHeader.VariableHighLimit := ucaPropBook^.VariableHighLimit;
      // The header is written as-is to the native stream, then passed through
      // ReverseRecordBytes before being written to the other-endian stream.
      binaryStreamNE.Write(serializedHeader,SizeOf(serializedHeader));
      ReverseRecordBytes(serializedHeader);
      binaryStreamOE.Write(serializedHeader,SizeOf(serializedHeader));
      GenerateBinaryUCA_BmpTables(binaryStreamNE,binaryStreamOE,ucaFirstTable,ucaSecondTable);
      GenerateBinaryUCA_OBmpTables(binaryStreamNE,binaryStreamOE,ucaoFirstTable,ucaoSecondTable);
      GenerateBinaryUCA_PropTable(binaryStreamNE,binaryStreamOE,ucaPropBook);
      binaryStreamNE.SaveToFile(
        outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NATIVE]])
      );
      binaryStreamOE.SaveToFile(
        outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]])
      );
      // Binary DUCET - END

      stream.Clear();
      GenerateLicenceText(stream);
      WriteLn('File parsed ...', DateTimeToStr(Now));
      WriteLn(' Props Len = ',Length(props));
      WriteLn(' Data Len = ',Length(data));
      {WriteLn('BMP Tables building ...', DateTimeToStr(Now));
      MakeBmpTables(firstTable,secondTable,props,data);
      WriteLn(' First Table length = ',Length(firstTable));
      WriteLn(' Second Table length = ',Length(secondTable));}

      WriteLn('BMP Tables building ...', DateTimeToStr(Now));
      MakeBmpTables3Levels(lvl3table1,lvl3table2,lvl3table3,data);
      WriteLn(' 3 Levels Tables :');
      WriteLn(' Len 1 = ',Length(lvl3table1));
      WriteLn(' Len 2 = ',Length(lvl3table2));
      WriteLn(' Len 3 = ',Length(lvl3table3));
      // Self-check: a lookup through the three BMP levels must give the same
      // property ID as GetPropID for each of the 256*16*16 code points.
      for i := 0 to 255 do begin
        for k := 0 to 15 do begin
          for h := 0 to 15 do begin
            if lvl3table3[lvl3table2[lvl3table1[i]][k]][h] <>
               GetPropID(256*i + 16*k +h,data)
            then begin
              writeln('3 levels errors, i=',i,'; k=',k,'; h=',h);
            end;
          end;
        end;
      end;

      binStreamNE.Clear();
      binStreamOE.Clear();
      WriteLn('Source generation ...', DateTimeToStr(Now));
      WriteLn('BMP Tables sources ...', DateTimeToStr(Now));
      Generate3lvlBmpTables(stream,lvl3table1,lvl3table2,lvl3table3);
      WriteLn('Properties Table sources ...', DateTimeToStr(Now));
      tmpStream.Clear();
      GenerateNumericTable(tmpStream,numericTable,True);
      tmpStream.SaveToFile(outputPath + 'unicodenumtable.pas');
      tmpStream.Clear();
      GeneratePropTable(binStreamNE,props,ENDIAN_NATIVE);
      GeneratePropTable(binStreamOE,props,ENDIAN_NON_NATIVE);
      //-------------------------------------------
      r := Compress(data);
      //-------------------
      WriteLn('OBMP Tables building ...', DateTimeToStr(Now));
      MakeOBmpTables3Levels(olvl3table1,olvl3table2,olvl3table3,r);
      WriteLn(' 3 Levels Tables :');
      WriteLn(' Len 1 = ',Length(olvl3table1));
      WriteLn(' Len 2 = ',Length(olvl3table2));
      WriteLn(' Len 3 = ',Length(olvl3table3));
      // Self-check of the OBMP tables: i is the high surrogate offset, and
      // (k*32)+h is the low surrogate offset.
      for i := 0 to 1023 do begin
        for k := 0 to 31 do begin
          for h := 0 to 31 do begin
            if olvl3table3[olvl3table2[olvl3table1[i]][k]][h] <>
               GetPropID(ToUCS4(HIGH_SURROGATE_BEGIN + i,LOW_SURROGATE_BEGIN + (k*32) + h),data)
            then begin
              writeln('3, OBMP levels errors, i=',i,'; k=',k,'; h=',h);
            end;
          end;
        end;
      end;
      WriteLn('OBMP Tables sources ...', DateTimeToStr(Now));
      Generate3lvlOBmpTables(stream,olvl3table1,olvl3table2,olvl3table3);
      //---------------------
      WriteLn('Decomposition Table sources ...', DateTimeToStr(Now));
      GenerateDecompositionBookTable(binStreamNE,decompositionBook,ENDIAN_NATIVE);
      GenerateDecompositionBookTable(binStreamOE,decompositionBook,ENDIAN_NON_NATIVE);
      stream.SaveToFile(outputPath + 'unicodedata.inc');
      binStreamNE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NATIVE]+'.inc');
      binStreamOE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]+'.inc');
      binStreamNE.Clear();
      binStreamOE.Clear();

      // diff-obmp.txt : for every entry of "data" from the first one above
      // $FFFF onwards, compare its PropID with the one reached through the
      // OBMP 3-level tables.
      h := -1;
      for i := Low(data) to High(data) do
        if (data[i].CodePoint > $FFFF) then begin
          h := i;
          Break;
        end;
      stream.Clear();
      // h stays -1 when "data" holds no entry above $FFFF; without this guard
      // the loop would start at data[-1].
      if (h >= 0) then begin
        // The start bound is evaluated once, so reusing h as the inner loop
        // counter below does not disturb this loop.
        for i := h to High(data) do begin
          p := @data[i];
          if (p^.LineType = 0) then begin
            FromUCS4(p^.CodePoint,hs,ls);
            //k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
            k := GetProp(
                   (hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
                   props,olvl3table1,olvl3table2,olvl3table3
                 )^.PropID;
            if (p^.PropID <> k) then begin
              s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
            end;
          end else begin
            for h := p^.StartCodePoint to p^.EndCodePoint do begin
              FromUCS4(h,hs,ls);
              //k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
              k := GetProp(
                     (hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
                     props,olvl3table1,olvl3table2,olvl3table3
                   )^.PropID;
              if (p^.PropID <> k) then begin
                s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
                stream.Write(s[1],Length(s));
                // one report per range is enough
                Break
              end;
            end;
          end;
        end;
      end;
      stream.SaveToFile(outputPath + 'diff-obmp.txt');

      // diff.txt : every entry of "data" looked up in "r" (the result of
      // Compress(data)).
      stream.Clear();
      for i := Low(data) to High(data) do begin
        p := @data[i];
        if (p^.LineType = 0) then begin
          k := GetPropID(p^.CodePoint,r);
          if (p^.PropID <> k) then begin
            s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
          end;
        end else begin
          for h := p^.StartCodePoint to p^.EndCodePoint do begin
            k := GetPropID(h,r);
            if (p^.PropID <> k) then begin
              s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
              Break
            end;
          end;
        end;
      end;
      stream.SaveToFile(outputPath + 'diff.txt');

      // diff2.txt : the reverse check, every entry of "r" looked up in "data".
      stream.Clear();
      for i := Low(r) to High(r) do begin
        p := @r[i];
        if (p^.LineType = 0) then begin
          k := GetPropID(p^.CodePoint,data);
          if (p^.PropID <> k) then begin
            s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
          end;
        end else begin
          for h := p^.StartCodePoint to p^.EndCodePoint do begin
            // Look up in "data", like the single code point branch above;
            // looking up in "r" would compare the table with itself.
            k := GetPropID(h,data);
            if (p^.PropID <> k) then begin
              s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
              Break
            end;
          end;
        end;
      end;
      stream.SaveToFile(outputPath + 'diff2.txt');
    finally
      binaryStreamOE.Free();
      binaryStreamNE.Free();
      tmpStream.Free();
      binStreamOE.Free();
      binStreamNE.Free();
      stream.Free();
    end;
  end.