unihelper.lpr 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. { Unicode tables parser.
  2. Copyright (c) 2012 by Inoussa OUEDRAOGO
  3. The source code is distributed under the Library GNU
  4. General Public License with the following modification:
  5. - object files and libraries linked into an application may be
  6. distributed without source code.
  7. If you didn't receive a copy of the file COPYING, contact:
  8. Free Software Foundation
  9. 675 Mass Ave
  10. Cambridge, MA 02139
  11. USA
  12. This program is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
  15. { This program generates tables as include-files for use
  16. with the unicode related sources. It expects the following
  17. unicode.org's files to be present in the same folder :
  18. * HangulSyllableType.txt
  19. * PropList.txt
  20. * UnicodeData.txt
  21. * allkeys.txt
  22. }
  23. {$DEFINE UCA_TEST}
  24. program unihelper;
  25. {$mode objfpc}{$H+}
  26. {$typedaddress on}
  27. uses
  28. SysUtils, Classes,
  29. helper, uca_test;
  const
    { Help text written to standard output at start-up: it lists the
      unicode.org input files the program looks for and describes the two
      optional command line arguments. }
    SUsage =
      'This program generates tables as include-files for use ' + sLineBreak +
      ' with the unicode related sources. It expects the following ' + sLineBreak +
      ' unicode.org''s files to be present in the same folder : ' + sLineBreak +
      ' * HangulSyllableType.txt ' + sLineBreak +
      ' * PropList.txt ' + sLineBreak +
      ' * UnicodeData.txt ' + sLineBreak +
      ' * allkeys.txt : Note that this file is the one provided for the CLDR root.' + sLineBreak +
      sLineBreak +
      'Usage : unihelper [<dataDir> <outputDir>] ' + sLineBreak +
      ' where ' + sLineBreak +
      ' dataDir : the directory where are stored the unicode files. The default' + sLineBreak +
      ' value is the program''s directory.' + sLineBreak +
      ' outputDir : The directory where the generated files will be stored. The' + sLineBreak +
      ' default value is the program''s directory.' + sLineBreak;
  { Renders a code point record as hexadecimal text, each value padded to at
    least 4 digits. A record whose LineType is 0 yields its single CodePoint;
    any other LineType yields "StartCodePoint..EndCodePoint". }
  function DumpCodePoint(ACodePoint : TCodePointRec) : string;
  begin
    if (ACodePoint.LineType <> 0) then
      Result := IntToHex(ACodePoint.StartCodePoint,4) + '..' +
                IntToHex(ACodePoint.EndCodePoint,4)
    else
      Result := IntToHex(ACodePoint.CodePoint,4);
  end;
  { Main program.
    Steps, in order:
      1. print the usage text, resolve the data and output directories
         (ParamStr(1) / ParamStr(2), output defaulting to the data directory)
         and check that the four input files exist;
      2. parse HangulSyllableType.txt, PropList.txt, UnicodeData.txt and
         allkeys.txt;
      3. build and write the UCA tables (ucadata.inc, its two endian variants
         and the binary collation_ducet_*.bco files);
      4. build, self-check and write the 3-level property tables
         (unicodedata.inc and its two endian variants);
      5. write three diagnostic files (diff-obmp.txt, diff.txt, diff2.txt)
         listing every mismatch between the parsed data and the generated
         tables / compressed data.
    The program halts with exit code 1 when the output directory cannot be
    created or an input file is missing. }
  var
    dataPath, outputPath : string;
    // tmpStream is only referenced by the disabled unicodenumtable.pas step.
    stream, binStreamNE, binStreamOE, tmpStream : TMemoryStream;
    binaryStreamNE, binaryStreamOE : TMemoryStream;
    hangulSyllables : TCodePointRecArray;
    ucaBook : TUCA_DataBook;
    ucaPropBook : PUCA_PropBook;
    propList : TPropListLineRecArray;
    whiteSpaceCodePoints : TCodePointRecArray;
    unifiedIdeographCodePoints : TCodePointRecArray;
    props : TPropRecArray;
    numericTable : TNumericValueArray;
    decomposition : TDecompositionArray;
    decompositionBook : TDecompositionBook;
    data : TDataLineRecArray;
    //----------------
    lvl3table1 : T3lvlBmp1Table;
    lvl3table2 : T3lvlBmp2Table;
    lvl3table3 : T3lvlBmp3Table;
    //----------------
    s : ansistring;
    i, k, h : Integer;
    firstOBmpLine : Integer; // index in "data" of the first line above $FFFF, -1 if none
    p : PDataLineRec;
    r : TDataLineRecArray;
    olvl3table1 : T3lvlOBmp1Table;
    olvl3table2 : T3lvlOBmp2Table;
    olvl3table3 : T3lvlOBmp3Table;
    //----------------
    hs, ls : Word;
    ucaFirstTable : TucaBmpFirstTable;
    ucaSecondTable : TucaBmpSecondTable;
    ucaoFirstTable : TucaoBmpFirstTable;
    ucaoSecondTable : TucaOBmpSecondTable;
    WL : Integer;
    serializedHeader : TSerializedCollationHeader;
  begin
    WriteLn(SUsage+sLineBreak);

    // Resolve the input and output directories from the command line.
    if (ParamCount > 0) then
      dataPath := IncludeTrailingPathDelimiter(ParamStr(1))
    else
      dataPath := ExtractFilePath(ParamStr(0));
    if (ParamCount > 1) then
      outputPath := IncludeTrailingPathDelimiter(ParamStr(2))
    else
      outputPath := dataPath;
    if not DirectoryExists(outputPath) then begin
      WriteLn('Directory not found : ',outputPath);
      if ForceDirectories(outputPath) then begin
        WriteLn(' directory created successfully');
      end else begin
        WriteLn(' fail to create directory.');
        Halt(1);
      end;
    end;
    if not(
         FileExists(dataPath + 'HangulSyllableType.txt') and
         FileExists(dataPath + 'PropList.txt') and
         FileExists(dataPath + 'UnicodeData.txt') and
         FileExists(dataPath + 'allkeys.txt')
       )
    then begin
      WriteLn('File(s) not found : HangulSyllableType.txt or PropList.txt or UnicodeData.txt or allkeys.txt .');
      Halt(1);
    end;

    // Every stream is nil-initialised so the finally block can free them
    // unconditionally, whichever step raises.
    binaryStreamNE := nil;
    binaryStreamOE := nil;
    binStreamOE := nil;
    binStreamNE := nil;
    tmpStream := nil;
    stream := TMemoryStream.Create();
    try
      binStreamNE := TMemoryStream.Create();
      binStreamOE := TMemoryStream.Create();
      tmpStream := TMemoryStream.Create();

      WriteLn('Load file HangulSyllableType.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'HangulSyllableType.txt');
      stream.Position := 0;
      hangulSyllables := nil;
      ParseHangulSyllableTypes(stream,hangulSyllables);
      stream.Clear();

      WriteLn('Load file PropList.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'PropList.txt');
      stream.Position := 0;
      propList := nil;
      ParseProps(stream,propList);
      stream.Clear();
      whiteSpaceCodePoints := FindCodePointsByProperty('White_Space',propList);
      writeln(' PropList Length = ',Length(propList));
      writeln(' White_Space Length = ',Length(whiteSpaceCodePoints));
      for i := Low(whiteSpaceCodePoints) to High(whiteSpaceCodePoints) do
        WriteLn(' ',DumpCodePoint(whiteSpaceCodePoints[i]):12,' , IsWhiteSpace = ',IsWhiteSpace(whiteSpaceCodePoints[i].CodePoint,whiteSpaceCodePoints));
      unifiedIdeographCodePoints := FindCodePointsByProperty('Unified_Ideograph',propList);
      writeln(' Unified_Ideograph Length = ',Length(unifiedIdeographCodePoints));

      WriteLn('Load file UnicodeData.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'UnicodeData.txt');
      stream.Position := 0;
      WriteLn('Parse file ...', DateTimeToStr(Now));
      data := nil;
      props := nil;
      Parse_UnicodeData(
        stream,props,numericTable,data,decomposition,hangulSyllables,
        whiteSpaceCodePoints,unifiedIdeographCodePoints
      );
      WriteLn('Decomposition building ...');
      MakeDecomposition(decomposition,decompositionBook);

      WriteLn('Load file UCA allkeys.txt ...', DateTimeToStr(Now));
      stream.LoadFromFile(dataPath + 'allkeys.txt');
      stream.Position := 0;
      ParseUCAFile(stream,ucaBook);

      // The conditional below is disabled (note the space after the brace),
      // so this pass always runs: it clears "Stored" on every UCA line whose
      // first code point has a decomposition, and records the longest weight
      // list among the lines that remain stored.
      { $IFDEF UCA_TEST}
      k := 0;
      WL := 0;
      for i := 0 to Length(ucaBook.Lines) - 1 do begin
        h := GetPropID(ucaBook.Lines[i].CodePoints[0],data);
        if (h <> -1) and
           ({props[h].HangulSyllable or} (props[h].DecompositionID <> -1))
        then begin
          Inc(k);
          ucaBook.Lines[i].Stored := False;
        end else begin
          ucaBook.Lines[i].Stored := True;
          if Length(ucaBook.Lines[i].Weights) > WL then
            WL := Length(ucaBook.Lines[i].Weights);
        end;
      end;
      WriteLn(
        'UCA, Version = ',ucaBook.Version,
        '; entries count = ',Length(ucaBook.Lines),
        '; characters (Decomposition) count = ',k,
        '; Max Weights Length = ',WL
      );
      { $ENDIF UCA_TEST}

      WriteLn('Construct UCA Property Book ...');
      ucaPropBook := nil;
      MakeUCA_Props(@ucaBook,ucaPropBook);
  {$IFDEF UCA_TEST}
      uca_CheckProp_1(ucaBook,ucaPropBook);
      uca_CheckProp_x(ucaBook,ucaPropBook);
  {$ENDIF UCA_TEST}
      WriteLn('Construct UCA BMP tables ...');
      MakeUCA_BmpTables(ucaFirstTable,ucaSecondTable,ucaPropBook);
      WriteLn(' UCA BMP Second Table Length = ',Length(ucaSecondTable));
  {$IFDEF UCA_TEST}
      uca_CheckProp_1y(ucaBook,ucaPropBook,@ucaFirstTable,@ucaSecondTable);
  {$ENDIF UCA_TEST}
      WriteLn('Construct UCA OBMP tables ...');
      MakeUCA_OBmpTables(ucaoFirstTable,ucaoSecondTable,ucaPropBook);
      WriteLn(' UCA OBMP Second Table Length = ',Length(ucaoSecondTable));
  {$IFDEF UCA_TEST}
      uca_CheckProp_2y(ucaBook,ucaPropBook,@ucaoFirstTable,@ucaoSecondTable);
  {$ENDIF UCA_TEST}

      binaryStreamNE := TMemoryStream.Create();
      binaryStreamOE := TMemoryStream.Create();

      // UCA include files: "stream" receives ucadata.inc, the two bin streams
      // receive its native / non-native endian companions.
      WriteLn('Generate UCA Props tables ...');
      binStreamNE.Clear();
      binStreamOE.Clear();
      GenerateLicenceText(binStreamNE);
      GenerateLicenceText(binStreamOE);
      GenerateUCA_PropTable(binStreamNE,ucaPropBook,ENDIAN_NATIVE);
      GenerateUCA_PropTable(binStreamOE,ucaPropBook,ENDIAN_NON_NATIVE);
      WriteLn('Generate UCA BMP tables ...');
      stream.Clear();
      GenerateLicenceText(stream);
      GenerateUCA_Head(stream,@ucaBook,ucaPropBook);
      GenerateUCA_BmpTables(stream,binStreamNE,binStreamOE,ucaFirstTable,ucaSecondTable);
      WriteLn('Generate UCA OBMP tables ...');
      GenerateUCA_OBmpTables(stream,binStreamNE,binStreamOE,ucaoFirstTable,ucaoSecondTable);
      s := outputPath + 'ucadata.inc';
      stream.SaveToFile(s);
      binStreamNE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NATIVE));
      binStreamOE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NON_NATIVE));
      binStreamNE.Clear();
      binStreamOE.Clear();

      // Binary DUCET
      FillChar(serializedHeader,SizeOf(TSerializedCollationHeader),0);
      StringToByteArray(ucaBook.Version,serializedHeader.Version);
      StringToByteArray('DUCET',serializedHeader.CollationName); //'Default Unicode Collation Element Table (DUCET)';
      serializedHeader.VariableWeight := Ord(ucaBook.VariableWeight);
      SetBit(serializedHeader.Backwards,0,ucaBook.Backwards[0]);
      SetBit(serializedHeader.Backwards,1,ucaBook.Backwards[1]);
      SetBit(serializedHeader.Backwards,2,ucaBook.Backwards[2]);
      SetBit(serializedHeader.Backwards,3,ucaBook.Backwards[3]);
      serializedHeader.BMP_Table1Length := Length(ucaFirstTable);
      serializedHeader.BMP_Table2Length := Length(TucaBmpSecondTableItem) *
                                           (Length(ucaSecondTable) * SizeOf(UInt24));
      serializedHeader.OBMP_Table1Length := Length(ucaoFirstTable) * SizeOf(Word);
      serializedHeader.OBMP_Table2Length := Length(TucaOBmpSecondTableItem) *
                                            (Length(ucaoSecondTable) * SizeOf(UInt24));
      serializedHeader.PropCount := ucaPropBook^.ItemSize;
      serializedHeader.VariableLowLimit := ucaPropBook^.VariableLowLimit;
      serializedHeader.VariableHighLimit := ucaPropBook^.VariableHighLimit;
      // The header is written as-is to the native stream, then byte-reversed
      // in place before being written to the other-endian stream.
      binaryStreamNE.Write(serializedHeader,SizeOf(serializedHeader));
      ReverseRecordBytes(serializedHeader);
      binaryStreamOE.Write(serializedHeader,SizeOf(serializedHeader));
      GenerateBinaryUCA_BmpTables(binaryStreamNE,binaryStreamOE,ucaFirstTable,ucaSecondTable);
      GenerateBinaryUCA_OBmpTables(binaryStreamNE,binaryStreamOE,ucaoFirstTable,ucaoSecondTable);
      GenerateBinaryUCA_PropTable(binaryStreamNE,binaryStreamOE,ucaPropBook);
      binaryStreamNE.SaveToFile(
        outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NATIVE]])
      );
      binaryStreamOE.SaveToFile(
        outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]])
      );
      // Binary DUCET - END

      stream.Clear();
      GenerateLicenceText(stream);
      WriteLn('File parsed ...', DateTimeToStr(Now));
      WriteLn(' Props Len = ',Length(props));
      WriteLn(' Data Len = ',Length(data));
      {WriteLn('BMP Tables building ...', DateTimeToStr(Now));
      MakeBmpTables(firstTable,secondTable,props,data);
      WriteLn(' First Table length = ',Length(firstTable));
      WriteLn(' Second Table length = ',Length(secondTable));}
      WriteLn('BMP Tables building ...', DateTimeToStr(Now));
      MakeBmpTables3Levels(lvl3table1,lvl3table2,lvl3table3,data);
      WriteLn(' 3 Levels Tables :');
      WriteLn(' Len 1 = ',Length(lvl3table1));
      WriteLn(' Len 2 = ',Length(lvl3table2));
      WriteLn(' Len 3 = ',Length(lvl3table3));
      // Self-check: the 3-level lookup must give the same property id as a
      // direct search in "data" for each of the 256*16*16 BMP code points.
      for i := 0 to 255 do begin
        for k := 0 to 15 do begin
          for h := 0 to 15 do begin
            if lvl3table3[lvl3table2[lvl3table1[i]][k]][h] <>
               GetPropID(256*i + 16*k +h,data)
            then begin
              writeln('3 levels errors, i=',i,'; k=',k,'; h=',h);
            end;
          end;
        end;
      end;

      binStreamNE.Clear();
      binStreamOE.Clear();
      WriteLn('Source generation ...', DateTimeToStr(Now));
      GenerateNumericTable(stream,numericTable,False);
      WriteLn('BMP Tables sources ...', DateTimeToStr(Now));
      Generate3lvlBmpTables(stream,lvl3table1,lvl3table2,lvl3table3);
      WriteLn('Properties Table sources ...', DateTimeToStr(Now));
      {tmpStream.Clear();
      GenerateNumericTable(tmpStream,numericTable,True);
      tmpStream.SaveToFile(outputPath + 'unicodenumtable.pas');
      tmpStream.Clear();}
      GeneratePropTable(binStreamNE,props,ENDIAN_NATIVE);
      GeneratePropTable(binStreamOE,props,ENDIAN_NON_NATIVE);
      //-------------------------------------------
      r := Compress(data);
      //-------------------
      WriteLn('OBMP Tables building ...', DateTimeToStr(Now));
      MakeOBmpTables3Levels(olvl3table1,olvl3table2,olvl3table3,r);
      WriteLn(' 3 Levels Tables :');
      WriteLn(' Len 1 = ',Length(olvl3table1));
      WriteLn(' Len 2 = ',Length(olvl3table2));
      WriteLn(' Len 3 = ',Length(olvl3table3));
      // Self-check over every (high, low) surrogate pair: 1024 high values,
      // and 32*32 low values addressed as (k*32) + h.
      for i := 0 to 1023 do begin
        for k := 0 to 31 do begin
          for h := 0 to 31 do begin
            if olvl3table3[olvl3table2[olvl3table1[i]][k]][h] <>
               GetPropID(ToUCS4(HIGH_SURROGATE_BEGIN + i,LOW_SURROGATE_BEGIN + (k*32) + h),data)
            then begin
              writeln('3, OBMP levels errors, i=',i,'; k=',k,'; h=',h);
            end;
          end;
        end;
      end;
      WriteLn('OBMP Tables sources ...', DateTimeToStr(Now));
      Generate3lvlOBmpTables(stream,olvl3table1,olvl3table2,olvl3table3);
      //---------------------
      WriteLn('Decomposition Table sources ...', DateTimeToStr(Now));
      GenerateDecompositionBookTable(binStreamNE,decompositionBook,ENDIAN_NATIVE);
      GenerateDecompositionBookTable(binStreamOE,decompositionBook,ENDIAN_NON_NATIVE);
      stream.SaveToFile(outputPath + 'unicodedata.inc');
      binStreamNE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NATIVE]+'.inc');
      binStreamOE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]+'.inc');
      binStreamNE.Clear();
      binStreamOE.Clear();

      // diff-obmp.txt : for every data line above the BMP, compare the parsed
      // property id with the one returned by the OBMP 3-level tables.
      firstOBmpLine := -1;
      for i := Low(data) to High(data) do
        if (data[i].CodePoint > $FFFF) then begin
          firstOBmpLine := i;
          Break;
        end;
      stream.Clear();
      // Guard: without any line above $FFFF the loop would start at index -1
      // and read data[-1]. The (empty) file is still written in that case.
      if (firstOBmpLine >= 0) then begin
        for i := firstOBmpLine to High(data) do begin
          p := @data[i];
          if (p^.LineType = 0) then begin
            FromUCS4(p^.CodePoint,hs,ls);
            //k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
            k := GetProp(
                   (hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
                   props,olvl3table1,olvl3table2,olvl3table3
                 )^.PropID;
            if (p^.PropID <> k) then begin
              s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
            end;
          end else begin
            // Range line: report only the first mismatch of the range.
            for h := p^.StartCodePoint to p^.EndCodePoint do begin
              FromUCS4(h,hs,ls);
              //k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
              k := GetProp(
                     (hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
                     props,olvl3table1,olvl3table2,olvl3table3
                   )^.PropID;
              if (p^.PropID <> k) then begin
                s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
                stream.Write(s[1],Length(s));
                Break
              end;
            end;
          end;
        end;
      end;
      stream.SaveToFile(outputPath + 'diff-obmp.txt');

      // diff.txt : every line of "data" looked up in the compressed array "r".
      stream.Clear();
      for i := Low(data) to High(data) do begin
        p := @data[i];
        if (p^.LineType = 0) then begin
          k := GetPropID(p^.CodePoint,r);
          if (p^.PropID <> k) then begin
            s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
          end;
        end else begin
          for h := p^.StartCodePoint to p^.EndCodePoint do begin
            k := GetPropID(h,r);
            if (p^.PropID <> k) then begin
              s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
              Break
            end;
          end;
        end;
      end;
      stream.SaveToFile(outputPath + 'diff.txt');

      // diff2.txt : the reverse check, every line of "r" looked up in "data".
      stream.Clear();
      for i := Low(r) to High(r) do begin
        p := @r[i];
        if (p^.LineType = 0) then begin
          k := GetPropID(p^.CodePoint,data);
          if (p^.PropID <> k) then begin
            s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
          end;
        end else begin
          for h := p^.StartCodePoint to p^.EndCodePoint do begin
            // Look the code point up in "data", like the single code point
            // branch does; searching "r" here would compare it with itself.
            k := GetPropID(h,data);
            if (p^.PropID <> k) then begin
              s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
              Break
            end;
          end;
        end;
      end;
      stream.SaveToFile(outputPath + 'diff2.txt');
    finally
      binaryStreamOE.Free();
      binaryStreamNE.Free();
      tmpStream.Free();
      binStreamOE.Free();
      binStreamNE.Free();
      stream.Free();
    end;
  end.