123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434 |
- { Unicode tables parser.
- Copyright (c) 2012 by Inoussa OUEDRAOGO
- The source code is distributed under the Library GNU
- General Public License with the following modification:
- - object files and libraries linked into an application may be
- distributed without source code.
- If you didn't receive a copy of the file COPYING, contact:
- Free Software Foundation
- 675 Mass Ave
- Cambridge, MA 02139
- USA
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. }
{ This program generates tables as include-files for use
  with the unicode related sources. It expects the following
  unicode.org files to be present in the data directory
  (by default, the program's own directory) :
    * HangulSyllableType.txt
    * PropList.txt
    * UnicodeData.txt
    * allkeys.txt (the one provided for the CLDR root)
}
- {$DEFINE UCA_TEST}
- program unihelper;
- {$mode objfpc}{$H+}
- {$typedaddress on}
- uses
- SysUtils, Classes,
- helper, uca_test;
- const
- SUsage =
- 'This program generates tables as include-files for use ' + sLineBreak +
- ' with the unicode related sources. It expects the following ' + sLineBreak +
- ' unicode.org''s files to be present in the same folder : ' + sLineBreak +
- ' * HangulSyllableType.txt ' + sLineBreak +
- ' * PropList.txt ' + sLineBreak +
- ' * UnicodeData.txt ' + sLineBreak +
- ' * allkeys.txt : Note that this file is the one provided for the CLDR root.' + sLineBreak +
- '' + sLineBreak +
- 'Usage : unihelper [<dataDir> <outputDir>] ' + sLineBreak +
- ' where ' + sLineBreak +
- ' dataDir : the directory where are stored the unicode files. The default' + sLineBreak +
- ' value is the program''s directory.' + sLineBreak +
- ' outputDir : The directory where the generated files will be stored. The' + sLineBreak +
- ' default value is the program''s directory.'+sLineBreak;
{ Renders a code point record as hexadecimal text (at least 4 digits):
    * LineType = 0  : the single value held in CodePoint;
    * otherwise     : "StartCodePoint..EndCodePoint". }
function DumpCodePoint(ACodePoint : TCodePointRec) : string;
begin
  if (ACodePoint.LineType <> 0) then
    Result := IntToHex(ACodePoint.StartCodePoint,4) + '..' +
              IntToHex(ACodePoint.EndCodePoint,4)
  else
    Result := IntToHex(ACodePoint.CodePoint,4);
end;
{ Main program.

  Steps, in order:
    1. resolve the data and output directories from the command line;
    2. parse HangulSyllableType.txt, PropList.txt, UnicodeData.txt and
       allkeys.txt;
    3. build and emit the UCA tables (ucadata*.inc, collation_ducet_*.bco);
    4. build and emit the property / decomposition tables
       (unicodedata*.inc, unicodenumtable.pas);
    5. run three self-checks whose mismatches are written to
       diff-obmp.txt, diff.txt and diff2.txt (an empty file means no
       mismatch was detected). }
var
  dataPath, outputPath : string;
  stream, binStreamNE, binStreamOE, tmpStream : TMemoryStream;
  binaryStreamNE, binaryStreamOE : TMemoryStream;
  hangulSyllables : TCodePointRecArray;
  ucaBook : TUCA_DataBook;
  ucaPropBook : PUCA_PropBook;
  propList : TPropListLineRecArray;
  whiteSpaceCodePoints : TCodePointRecArray;
  props : TPropRecArray;
  numericTable : TNumericValueArray;
  decomposition : TDecompositionArray;
  decompositionBook : TDecompositionBook;
  data : TDataLineRecArray;
  //----------------
  lvl3table1 : T3lvlBmp1Table;
  lvl3table2 : T3lvlBmp2Table;
  lvl3table3 : T3lvlBmp3Table;
  //----------------
  s : ansistring;
  i, k, h : Integer;
  // Index in "data" of the first entry whose code point is above $FFFF,
  // or -1 when there is no such entry.
  firstObmp : Integer;
  p : PDataLineRec;
  r : TDataLineRecArray;
  olvl3table1 : T3lvlOBmp1Table;
  olvl3table2 : T3lvlOBmp2Table;
  olvl3table3 : T3lvlOBmp3Table;
  //----------------
  hs, ls : Word;
  ucaFirstTable : TucaBmpFirstTable;
  ucaSecondTable : TucaBmpSecondTable;
  ucaoFirstTable : TucaoBmpFirstTable;
  ucaoSecondTable : TucaOBmpSecondTable;
  WL : Integer;
  serializedHeader : TSerializedCollationHeader;
begin
  WriteLn(SUsage+sLineBreak);

  // First argument : data directory (default : the program's directory).
  if (ParamCount > 0) then
    dataPath := IncludeTrailingPathDelimiter(ParamStr(1))
  else
    dataPath := ExtractFilePath(ParamStr(0));
  // Second argument : output directory (default : the data directory).
  if (ParamCount > 1) then
    outputPath := IncludeTrailingPathDelimiter(ParamStr(2))
  else
    outputPath := dataPath;

  if not DirectoryExists(outputPath) then begin
    WriteLn('Directory not found : ',outputPath);
    if ForceDirectories(outputPath) then begin
      WriteLn(' directory created successfully');
    end else begin
      WriteLn(' fail to create directory.');
      Halt(1);
    end;
  end;

  // All four input files are mandatory.
  if not(
       FileExists(dataPath + 'HangulSyllableType.txt') and
       FileExists(dataPath + 'PropList.txt') and
       FileExists(dataPath + 'UnicodeData.txt') and
       FileExists(dataPath + 'allkeys.txt')
     )
  then begin
    WriteLn('File(s) not found : HangulSyllableType.txt or PropList.txt or UnicodeData.txt or allkeys.txt .');
    Halt(1);
  end;

  // The streams are set to nil first so that the "finally" section can
  // call Free() on all of them, whatever the point of failure.
  binaryStreamNE := nil;
  binaryStreamOE := nil;
  binStreamOE := nil;
  binStreamNE := nil;
  tmpStream := nil;
  stream := TMemoryStream.Create();
  try
    binStreamNE := TMemoryStream.Create();
    binStreamOE := TMemoryStream.Create();
    tmpStream := TMemoryStream.Create();

    //----------------------------------------------------------------
    // Input parsing
    //----------------------------------------------------------------
    WriteLn('Load file HangulSyllableType.txt ...', DateTimeToStr(Now));
    stream.LoadFromFile(dataPath + 'HangulSyllableType.txt');
    stream.Position := 0;
    hangulSyllables := nil;
    ParseHangulSyllableTypes(stream,hangulSyllables);
    stream.Clear();

    WriteLn('Load file PropList.txt ...', DateTimeToStr(Now));
    stream.LoadFromFile(dataPath + 'PropList.txt');
    stream.Position := 0;
    propList := nil;
    ParseProps(stream,propList);
    stream.Clear();
    whiteSpaceCodePoints := FindCodePointsByProperty('White_Space',propList);
    writeln(' PropList Length = ',Length(propList));
    writeln(' White_Space Length = ',Length(whiteSpaceCodePoints));
    for i := Low(whiteSpaceCodePoints) to High(whiteSpaceCodePoints) do
      WriteLn(' ',DumpCodePoint(whiteSpaceCodePoints[i]):12,' , IsWhiteSpace = ',IsWhiteSpace(whiteSpaceCodePoints[i].CodePoint,whiteSpaceCodePoints));

    WriteLn('Load file UnicodeData.txt ...', DateTimeToStr(Now));
    stream.LoadFromFile(dataPath + 'UnicodeData.txt');
    stream.Position := 0;
    WriteLn('Parse file ...', DateTimeToStr(Now));
    data := nil;
    props := nil;
    Parse_UnicodeData(stream,props,numericTable,data,decomposition,hangulSyllables,whiteSpaceCodePoints);
    WriteLn('Decomposition building ...');
    MakeDecomposition(decomposition,decompositionBook);

    WriteLn('Load file UCA allkeys.txt ...', DateTimeToStr(Now));
    stream.LoadFromFile(dataPath + 'allkeys.txt');
    stream.Position := 0;
    ParseUCAFile(stream,ucaBook);

    // NOTE : the "$IFDEF" below is deliberately written with a space after
    // the brace, which makes it a plain comment : this section always runs.
    { $IFDEF UCA_TEST}
    // Mark which UCA lines are stored : a line is skipped when its first
    // code point has a property entry with a decomposition. "k" counts the
    // skipped lines, "WL" is the longest weights list among the stored ones.
    k := 0;
    WL := 0;
    for i := 0 to Length(ucaBook.Lines) - 1 do begin
      h := GetPropID(ucaBook.Lines[i].CodePoints[0],data);
      if (h <> -1) and
         ({props[h].HangulSyllable or} (props[h].DecompositionID <> -1))
      then begin
        Inc(k);
        ucaBook.Lines[i].Stored := False;
      end else begin
        ucaBook.Lines[i].Stored := True;
        if Length(ucaBook.Lines[i].Weights) > WL then
          WL := Length(ucaBook.Lines[i].Weights);
      end;
    end;
    // A separator is printed before "Max Weights Length" so that it is not
    // glued to the preceding count.
    WriteLn(
      'UCA, Version = ',ucaBook.Version,'; entries count = ',Length(ucaBook.Lines),' ; Hangul # = ',k,
      ' ; Max Weights Length = ',WL
    );
    { $ENDIF UCA_TEST}

    //----------------------------------------------------------------
    // UCA tables
    //----------------------------------------------------------------
    WriteLn('Construct UCA Property Book ...');
    ucaPropBook := nil;
    MakeUCA_Props(@ucaBook,ucaPropBook);
    {$IFDEF UCA_TEST}
    uca_CheckProp_1(ucaBook,ucaPropBook);
    uca_CheckProp_x(ucaBook,ucaPropBook);
    {$ENDIF UCA_TEST}

    WriteLn('Construct UCA BMP tables ...');
    MakeUCA_BmpTables(ucaFirstTable,ucaSecondTable,ucaPropBook);
    WriteLn(' UCA BMP Second Table Length = ',Length(ucaSecondTable));
    {$IFDEF UCA_TEST}
    uca_CheckProp_1y(ucaBook,ucaPropBook,@ucaFirstTable,@ucaSecondTable);
    {$ENDIF UCA_TEST}

    WriteLn('Construct UCA OBMP tables ...');
    MakeUCA_OBmpTables(ucaoFirstTable,ucaoSecondTable,ucaPropBook);
    WriteLn(' UCA OBMP Second Table Length = ',Length(ucaoSecondTable));
    {$IFDEF UCA_TEST}
    uca_CheckProp_2y(ucaBook,ucaPropBook,@ucaoFirstTable,@ucaoSecondTable);
    {$ENDIF UCA_TEST}

    binaryStreamNE := TMemoryStream.Create();
    binaryStreamOE := TMemoryStream.Create();

    // Each table is generated twice : once per endianness
    // (ENDIAN_NATIVE and ENDIAN_NON_NATIVE).
    WriteLn('Generate UCA Props tables ...');
    binStreamNE.Clear();
    binStreamOE.Clear();
    GenerateLicenceText(binStreamNE);
    GenerateLicenceText(binStreamOE);
    GenerateUCA_PropTable(binStreamNE,ucaPropBook,ENDIAN_NATIVE);
    GenerateUCA_PropTable(binStreamOE,ucaPropBook,ENDIAN_NON_NATIVE);

    WriteLn('Generate UCA BMP tables ...');
    stream.Clear();
    GenerateLicenceText(stream);
    GenerateUCA_Head(stream,@ucaBook,ucaPropBook);
    GenerateUCA_BmpTables(stream,binStreamNE,binStreamOE,ucaFirstTable,ucaSecondTable);
    WriteLn('Generate UCA OBMP tables ...');
    GenerateUCA_OBmpTables(stream,binStreamNE,binStreamOE,ucaoFirstTable,ucaoSecondTable);

    s := outputPath + 'ucadata.inc';
    stream.SaveToFile(s);
    binStreamNE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NATIVE));
    binStreamOE.SaveToFile(GenerateEndianIncludeFileName(s,ENDIAN_NON_NATIVE));
    binStreamNE.Clear();
    binStreamOE.Clear();

    // Binary DUCET
    FillChar(serializedHeader,SizeOf(TSerializedCollationHeader),0);
    serializedHeader.Version := ucaBook.Version;
    serializedHeader.CollationName := 'DUCET';//'Default Unicode Collation Element Table (DUCET)';
    serializedHeader.VariableWeight := Ord(ucaBook.VariableWeight);
    for i := 0 to 3 do
      SetBit(serializedHeader.Backwards,i,ucaBook.Backwards[i]);
    serializedHeader.BMP_Table1Length := Length(ucaFirstTable);
    serializedHeader.BMP_Table2Length := Length(TucaBmpSecondTableItem) *
                                         (Length(ucaSecondTable) * SizeOf(UInt24));
    serializedHeader.OBMP_Table1Length := Length(ucaoFirstTable) * SizeOf(Word);
    serializedHeader.OBMP_Table2Length := Length(TucaOBmpSecondTableItem) *
                                          (Length(ucaoSecondTable) * SizeOf(UInt24));
    serializedHeader.PropCount := ucaPropBook^.ItemSize;
    serializedHeader.VariableLowLimit := ucaPropBook^.VariableLowLimit;
    serializedHeader.VariableHighLimit := ucaPropBook^.VariableHighLimit;
    // The header is written as is to the native stream, then its bytes are
    // reversed before it is written to the other-endian stream.
    binaryStreamNE.Write(serializedHeader,SizeOf(serializedHeader));
    ReverseRecordBytes(serializedHeader);
    binaryStreamOE.Write(serializedHeader,SizeOf(serializedHeader));
    GenerateBinaryUCA_BmpTables(binaryStreamNE,binaryStreamOE,ucaFirstTable,ucaSecondTable);
    GenerateBinaryUCA_OBmpTables(binaryStreamNE,binaryStreamOE,ucaoFirstTable,ucaoSecondTable);
    GenerateBinaryUCA_PropTable(binaryStreamNE,binaryStreamOE,ucaPropBook);
    binaryStreamNE.SaveToFile(
      outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NATIVE]])
    );
    binaryStreamOE.SaveToFile(
      outputPath + Format('collation_ducet_%s.bco',[ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]])
    );
    // Binary DUCET - END

    //----------------------------------------------------------------
    // Property tables
    //----------------------------------------------------------------
    stream.Clear();
    GenerateLicenceText(stream);
    WriteLn('File parsed ...', DateTimeToStr(Now));
    WriteLn(' Props Len = ',Length(props));
    WriteLn(' Data Len = ',Length(data));
    {WriteLn('BMP Tables building ...', DateTimeToStr(Now));
    MakeBmpTables(firstTable,secondTable,props,data);
    WriteLn(' First Table length = ',Length(firstTable));
    WriteLn(' Second Table length = ',Length(secondTable));}
    WriteLn('BMP Tables building ...', DateTimeToStr(Now));
    MakeBmpTables3Levels(lvl3table1,lvl3table2,lvl3table3,data);
    WriteLn(' 3 Levels Tables :');
    WriteLn(' Len 1 = ',Length(lvl3table1));
    WriteLn(' Len 2 = ',Length(lvl3table2));
    WriteLn(' Len 3 = ',Length(lvl3table3));
    // Self-check : every value 256*i + 16*k + h (0..$FFFF) must resolve to
    // the same property ID through the 3-level tables and through GetPropID.
    for i := 0 to 255 do begin
      for k := 0 to 15 do begin
        for h := 0 to 15 do begin
          if lvl3table3[lvl3table2[lvl3table1[i]][k]][h] <>
             GetPropID(256*i + 16*k +h,data)
          then begin
            writeln('3 levels errors, i=',i,'; k=',k,'; h=',h);
          end;
        end;
      end;
    end;

    binStreamNE.Clear();
    binStreamOE.Clear();
    WriteLn('Source generation ...', DateTimeToStr(Now));
    WriteLn('BMP Tables sources ...', DateTimeToStr(Now));
    Generate3lvlBmpTables(stream,lvl3table1,lvl3table2,lvl3table3);
    WriteLn('Properties Table sources ...', DateTimeToStr(Now));
    tmpStream.Clear();
    GenerateNumericTable(tmpStream,numericTable,True);
    tmpStream.SaveToFile(outputPath + 'unicodenumtable.pas');
    tmpStream.Clear();
    GeneratePropTable(binStreamNE,props,ENDIAN_NATIVE);
    GeneratePropTable(binStreamOE,props,ENDIAN_NON_NATIVE);
    //-------------------------------------------
    r := Compress(data);
    //-------------------
    WriteLn('OBMP Tables building ...', DateTimeToStr(Now));
    MakeOBmpTables3Levels(olvl3table1,olvl3table2,olvl3table3,r);
    WriteLn(' 3 Levels Tables :');
    WriteLn(' Len 1 = ',Length(olvl3table1));
    WriteLn(' Len 2 = ',Length(olvl3table2));
    WriteLn(' Len 3 = ',Length(olvl3table3));
    // Self-check : same principle as for the BMP tables, the code point
    // being built from a (high, low) surrogate pair with ToUCS4.
    for i := 0 to 1023 do begin
      for k := 0 to 31 do begin
        for h := 0 to 31 do begin
          if olvl3table3[olvl3table2[olvl3table1[i]][k]][h] <>
             GetPropID(ToUCS4(HIGH_SURROGATE_BEGIN + i,LOW_SURROGATE_BEGIN + (k*32) + h),data)
          then begin
            writeln('3, OBMP levels errors, i=',i,'; k=',k,'; h=',h);
          end;
        end;
      end;
    end;
    WriteLn('OBMP Tables sources ...', DateTimeToStr(Now));
    Generate3lvlOBmpTables(stream,olvl3table1,olvl3table2,olvl3table3);
    //---------------------
    WriteLn('Decomposition Table sources ...', DateTimeToStr(Now));
    GenerateDecompositionBookTable(binStreamNE,decompositionBook,ENDIAN_NATIVE);
    GenerateDecompositionBookTable(binStreamOE,decompositionBook,ENDIAN_NON_NATIVE);
    stream.SaveToFile(outputPath + 'unicodedata.inc');
    binStreamNE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NATIVE]+'.inc');
    binStreamOE.SaveToFile(outputPath + 'unicodedata_'+ENDIAN_SUFFIX[ENDIAN_NON_NATIVE]+'.inc');
    binStreamNE.Clear();
    binStreamOE.Clear();

    //----------------------------------------------------------------
    // Check 1 (diff-obmp.txt) : entries above $FFFF, looked up through
    // the OBMP 3-level tables, must give back their own PropID.
    //----------------------------------------------------------------
    firstObmp := -1;
    for i := Low(data) to High(data) do
      if (data[i].CodePoint > $FFFF) then begin
        firstObmp := i;
        Break;
      end;
    stream.Clear();
    // When no entry lies above $FFFF there is nothing to check; iterating
    // from -1 would read outside of "data".
    if (firstObmp >= 0) then begin
      for i := firstObmp to High(data) do begin
        p := @data[i];
        if (p^.LineType = 0) then begin
          FromUCS4(p^.CodePoint,hs,ls);
          //k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
          k := GetProp(
                 (hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
                 props,olvl3table1,olvl3table2,olvl3table3
               )^.PropID;
          if (p^.PropID <> k) then begin
            s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
          end;
        end else begin
          // Range line : only the first mismatch of the range is reported.
          for h := p^.StartCodePoint to p^.EndCodePoint do begin
            FromUCS4(h,hs,ls);
            //k := GetProp(hs,ls,props,ofirstTable,osecondTable)^.PropID;
            k := GetProp(
                   (hs-HIGH_SURROGATE_BEGIN),(ls-LOW_SURROGATE_BEGIN),
                   props,olvl3table1,olvl3table2,olvl3table3
                 )^.PropID;
            if (p^.PropID <> k) then begin
              s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
              stream.Write(s[1],Length(s));
              Break
            end;
          end;
        end;
      end;
    end;
    stream.SaveToFile(outputPath + 'diff-obmp.txt');

    //----------------------------------------------------------------
    // Check 2 (diff.txt) : every entry of "data" must be found with the
    // same PropID in the compressed array "r".
    //----------------------------------------------------------------
    stream.Clear();
    for i := Low(data) to High(data) do begin
      p := @data[i];
      if (p^.LineType = 0) then begin
        k := GetPropID(p^.CodePoint,r);
        if (p^.PropID <> k) then begin
          s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
          stream.Write(s[1],Length(s));
        end;
      end else begin
        for h := p^.StartCodePoint to p^.EndCodePoint do begin
          k := GetPropID(h,r);
          if (p^.PropID <> k) then begin
            s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
            Break
          end;
        end;
      end;
    end;
    stream.SaveToFile(outputPath + 'diff.txt');

    //----------------------------------------------------------------
    // Check 3 (diff2.txt) : the reverse of check 2, every entry of "r"
    // must be found with the same PropID in "data".
    //----------------------------------------------------------------
    stream.Clear();
    for i := Low(r) to High(r) do begin
      p := @r[i];
      if (p^.LineType = 0) then begin
        k := GetPropID(p^.CodePoint,data);
        if (p^.PropID <> k) then begin
          s := Format('#%d-%d #%d',[p^.CodePoint,p^.PropID,k]) + sLineBreak;
          stream.Write(s[1],Length(s));
        end;
      end else begin
        for h := p^.StartCodePoint to p^.EndCodePoint do begin
          // The lookup targets "data", as in the single code point branch;
          // looking up in "r" would compare "r" with itself.
          k := GetPropID(h,data);
          if (p^.PropID <> k) then begin
            s := Format('##%d;%d-%d #%d',[p^.StartCodePoint,p^.EndCodePoint,p^.PropID,k]) + sLineBreak;
            stream.Write(s[1],Length(s));
            Break
          end;
        end;
      end;
    end;
    stream.SaveToFile(outputPath + 'diff2.txt');
  finally
    // Free() is nil-safe, so streams that were never created are skipped.
    binaryStreamOE.Free();
    binaryStreamNE.Free();
    tmpStream.Free();
    binStreamOE.Free();
    binStreamNE.Free();
    stream.Free();
  end;
end.
|