Kaynağa Gözat

* deduping #urlstr and #strings

git-svn-id: trunk@13767 -
marco 16 yıl önce
ebeveyn
işleme
27f0028dcc
2 değiştirilmiş dosya ile 131 ekleme ve 50 silme
  1. 14 10
      packages/chm/src/chmfilewriter.pas
  2. 117 40
      packages/chm/src/chmwriter.pas

+ 14 - 10
packages/chm/src/chmfilewriter.pas

@@ -26,10 +26,10 @@ interface
 
 uses
   Classes, SysUtils, chmwriter;
-  
+
 type
   TChmProject = class;
-  
+
   TChmProgressCB = procedure (Project: TChmProject; CurrentFile: String) of object;
 
   { TChmProject }
@@ -42,6 +42,7 @@ type
     FFiles: TStrings;
     FIndexFileName: String;
     FMakeBinaryTOC: Boolean;
+    FMakeBinaryIndex: Boolean;
     FMakeSearchable: Boolean;
     FFileName: String;
     FOnProgress: TChmProgressCB;
@@ -66,12 +67,13 @@ type
     property AutoFollowLinks: Boolean read FAutoFollowLinks write FAutoFollowLinks;
     property TableOfContentsFileName: String read FTableOfContentsFileName write FTableOfContentsFileName;
     property MakeBinaryTOC: Boolean read FMakeBinaryTOC write FMakeBinaryTOC;
+    property MakeBinaryIndex: Boolean read FMakeBinaryIndex write FMakeBinaryIndex;
     property Title: String read FTitle write FTitle;
     property IndexFileName: String read FIndexFileName write FIndexFileName;
     property MakeSearchable: Boolean read FMakeSearchable write FMakeSearchable;
     property DefaultPage: String read FDefaultPage write FDefaultPage;
     property DefaultFont: String read FDefaultFont write FDefaultFont;
-    
+
     property OnProgress: TChmProgressCB read FOnProgress write FOnProgress;
   end;
 
@@ -90,7 +92,7 @@ begin
   // clean up the filename
   FileName := StringReplace(ExtractFileName(DataName), '\', '/', [rfReplaceAll]);
   FileName := StringReplace(FileName, '//', '/', [rfReplaceAll]);
-  
+
   PathInChm := '/'+ExtractFilePath(DataName);
   if Assigned(FOnProgress) then FOnProgress(Self, DataName);
 end;
@@ -145,7 +147,7 @@ begin
   Cfg := TXMLConfig.Create(nil);
   Cfg.Filename := AFileName;
   FileName := AFileName;
-  
+
   Files.Clear;
   FileCount := Cfg.GetValue('Files/Count/Value', 0);
   for I := 0 to FileCount-1 do begin
@@ -153,8 +155,9 @@ begin
   end;
   IndexFileName := Cfg.GetValue('Files/IndexFile/Value','');
   TableOfContentsFileName := Cfg.GetValue('Files/TOCFile/Value','');
+  // For chm file merging, bintoc must be false and binindex true. Change defaults in time?
   MakeBinaryTOC := Cfg.GetValue('Files/MakeBinaryTOC/Value', True);
-  
+  MakeBinaryIndex:= Cfg.GetValue('Files/MakeBinaryIndex/Value', False);
   AutoFollowLinks := Cfg.GetValue('Settings/AutoFollowLinks/Value', False);
   MakeSearchable := Cfg.GetValue('Settings/MakeSearchable/Value', False);
   DefaultPage := Cfg.GetValue('Settings/DefaultPage/Value', '');
@@ -181,7 +184,7 @@ begin
   Cfg.SetValue('Files/IndexFile/Value', IndexFileName);
   Cfg.SetValue('Files/TOCFile/Value', TableOfContentsFileName);
   Cfg.SetValue('Files/MakeBinaryTOC/Value',MakeBinaryTOC);
-
+  Cfg.SetValue('Files/MakeBinaryIndex/Value',MakeBinaryIndex);
   Cfg.SetValue('Settings/AutoFollowLinks/Value', AutoFollowLinks);
   Cfg.SetValue('Settings/MakeSearchable/Value', MakeSearchable);
   Cfg.SetValue('Settings/DefaultPage/Value', DefaultPage);
@@ -212,7 +215,7 @@ begin
   // our callback to get data
   Writer.OnGetFileData := @GetData;
   Writer.OnLastFile    := @LastFileAdded;
-  
+
   // give it the list of files
   Writer.FilesToCompress.AddStrings(Files);
 
@@ -222,10 +225,11 @@ begin
   Writer.DefaultFont := DefaultFont;
   Writer.FullTextSearch := MakeSearchable;
   Writer.HasBinaryTOC := MakeBinaryTOC;
-  
+  Writer.HasBinaryIndex := MakeBinaryIndex;
+
   // and write!
   Writer.Execute;
-  
+
   if Assigned(TOCStream) then TOCStream.Free;
   if Assigned(IndexStream) then IndexStream.Free;
 end;

+ 117 - 40
packages/chm/src/chmwriter.pas

@@ -22,7 +22,7 @@ unit chmwriter;
 {$MODE OBJFPC}{$H+}
 
 interface
-uses Classes, ChmBase, chmtypes, chmspecialfiles, HtmlIndexer, chmsitemap;
+uses Classes, ChmBase, chmtypes, chmspecialfiles, HtmlIndexer, chmsitemap, Avl_Tree;
 
 type
 
@@ -33,6 +33,15 @@ type
   //  FileName :  /home/user/helpstuff/index.html > index.html
   //  Stream   :  the file opened with DataName should be written to this stream
 
+Type
+   TStringIndex = Class    // AVLTree needs wrapping in non automated reference type
+                      TheString : String;
+                      StrId     : Integer;
+                    end;
+   TUrlStrIndex = Class
+                      UrlStr    : String;
+                      UrlStrId  : Integer;
+                    end;
 
   { TChmWriter }
 
@@ -40,9 +49,9 @@ type
     FOnLastFile: TNotifyEvent;
   private
     FHasBinaryTOC: Boolean;
-  
+    FHasBinaryIndex: Boolean;
     ForceExit: Boolean;
-    
+
     FDefaultFont: String;
     FDefaultPage: String;
     FFullTextSearch: Boolean;
@@ -82,6 +91,10 @@ type
     HeaderSuffix: TITSFHeaderSuffix; //contains the offset of CONTENTSection0 from zero
     HeaderSection0: TITSPHeaderPrefix;
     HeaderSection1: TITSPHeader; // DirectoryListings header
+    FAvlStrings   : TAVLTree;    // dedupe strings
+    FAvlURLStr    : TAVLTree;    // dedupe urltbl + binindex must resolve URL to topicid
+    SpareString   : TStringIndex;
+    SpareUrlStr   : TUrlStrIndex;
     // DirectoryListings
     // CONTENT Section 0 (section 1 is contained in section 0)
     // EOF
@@ -138,7 +151,8 @@ type
     property FullTextSearch: Boolean read FFullTextSearch write FFullTextSearch;
     property SearchTitlesOnly: Boolean read FSearchTitlesOnly write FSearchTitlesOnly;
     property HasBinaryTOC: Boolean read FHasBinaryTOC write FHasBinaryTOC;
-    property DefaultFont: String read FDefaultFont write FDefaultFont;
+   property HasBinaryIndex: Boolean read FHasBinaryIndex write FHasBinaryIndex;
+   property DefaultFont: String read FDefaultFont write FDefaultFont;
     property DefaultPage: String read FDefaultPage write FDefaultPage;
     property TempRawStream: TStream read FTempStream write SetTempRawStream;
     //property LocaleID: dword read ITSFHeader.LanguageID write ITSFHeader.LanguageID;
@@ -148,12 +162,31 @@ implementation
 uses dateutils, sysutils, paslzxcomp, chmFiftiMain;
 
 const
-
   LZX_WINDOW_SIZE = 16; // 16 = 2 frames = 1 shl 16
   LZX_FRAME_SIZE = $8000;
 
 {$I chmobjinstconst.inc}
 
+
+Function CompareStrings(Node1, Node2: Pointer): integer;
+var n1,n2 : TStringIndex;
+begin
+  n1:=TStringIndex(Node1); n2:=TStringIndex(Node2);
+  Result := CompareText(n1.TheString, n2.TheString);
+  if Result < 0 then Result := -1
+  else if Result > 0 then Result := 1;
+end;
+
+
+Function CompareUrlStrs(Node1, Node2: Pointer): integer;
+var n1,n2 : TUrlStrIndex;
+begin
+  n1:=TUrlStrIndex(Node1); n2:=TUrlStrIndex(Node2);
+  Result := CompareText(n1.UrlStr, n2.UrlStr);
+  if Result < 0 then Result := -1
+  else if Result > 0 then Result := 1;
+end;
+
 { TChmWriter }
 
 procedure TChmWriter.InitITSFHeader;
@@ -179,10 +212,10 @@ begin
   // header section 1
   HeaderSection1Table.PosFromZero := HeaderSection0Table.PosFromZero + HeaderSection0Table.Length;
   HeaderSection1Table.Length := SizeOf(TITSPHeader)+FDirectoryListings.Size;
-  
+
   //contains the offset of CONTENT Section0 from zero
   HeaderSuffix.Offset := HeaderSection1Table.PosFromZero + HeaderSection1Table.Length;
-  
+
   // now fix endian stuff
   HeaderSection0Table.PosFromZero := NToLE(HeaderSection0Table.PosFromZero);
   HeaderSection0Table.Length := NToLE(HeaderSection0Table.Length);
@@ -209,7 +242,7 @@ begin
     //IndexOfRootChunk := -1;// if no root chunk
     //FirstPMGLChunkIndex,
     //LastPMGLChunkIndex: LongWord;
-    
+
     Unknown2 := NToLE(Longint(-1));
     //DirectoryChunkCount: LongWord;
     LanguageID := NToLE(DWord($0409));
@@ -219,7 +252,7 @@ begin
     Unknown4 := NToLE(Longint(-1));
     Unknown5 := NToLE(Longint(-1));
   end;
-  
+
   // more endian stuff
   HeaderSuffix.Offset := NToLE(HeaderSuffix.Offset);
 end;
@@ -289,7 +322,7 @@ const
     ParentIndex,
     TmpIndex: TPMGIDirectoryChunk;
   begin
-    with IndexHeader do 
+    with IndexHeader do
     begin
       PMGIsig := PMGI;
       UnusedSpace := NToLE(IndexBlock.FreeSpace);
@@ -298,11 +331,11 @@ const
     IndexBlock.WriteChunkToStream(FDirectoryListings, ChunkIndex, ShouldFinish);
     IndexBlock.Clear;
     if HeaderSection1.IndexOfRootChunk < 0 then HeaderSection1.IndexOfRootChunk := ChunkIndex;
-    if ShouldFinish then 
+    if ShouldFinish then
     begin
       HeaderSection1.IndexTreeDepth := 2;
       ParentIndex := IndexBlock.ParentChunk;
-      if ParentIndex <> nil then 
+      if ParentIndex <> nil then
       repeat // the parent index is notified by our child index when to write
         HeaderSection1.IndexOfRootChunk := ChunkIndex;
         TmpIndex := ParentIndex;
@@ -344,7 +377,7 @@ begin
   FInternalFiles.Sort;
   HeaderSection1.IndexTreeDepth := 1;
   HeaderSection1.IndexOfRootChunk := -1;
-  
+
   ChunkIndex := 0;
 
   IndexBlock := TPMGIDirectoryChunk.Create(SizeOf(TPMGIIndexChunk));
@@ -375,9 +408,9 @@ begin
 
     if not ListingBlock.CanHold(Size) then
       WriteListChunk;
-    
+
     ListingBlock.WriteEntry(Size, @Buffer[0]);
-    
+
     if ListingBlock.ItemCount = 1 then begin // add the first list item to the index
       Move(Buffer[0], FirstListEntry.Entry[0], FESize);
       FirstListEntry.Size := FESize + WriteCompressedInteger(@FirstListEntry.Entry[FESize], ChunkIndex);
@@ -395,7 +428,7 @@ begin
 
   IndexBlock.Free;
   ListingBlock.Free;
-  
+
   //now fix some endian stuff
   HeaderSection1.IndexOfRootChunk := NToLE(HeaderSection1.IndexOfRootChunk);
   HeaderSection1.IndexTreeDepth := NtoLE(HeaderSection1.IndexTreeDepth);
@@ -463,13 +496,13 @@ begin
   // two for a QWord
   FSection0.WriteDWord(0);
   FSection0.WriteDWord(0);
-  
+
   FSection0.WriteDWord(0);
   FSection0.WriteDWord(0);
 
-  
 
-  
+
+
   ////////////////////////<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
   // 2  default page to load
   if FDefaultPage <> '' then begin
@@ -493,14 +526,14 @@ begin
     FSection0.Write(FDefaultFont[1], Length(FDefaultFont));
     FSection0.WriteByte(0);
   end;
-  
+
   // 6
   // unneeded. if output file is :  /somepath/OutFile.chm the value here is outfile(lowercase)
   {FSection0.WriteWord(6);
   FSection0.WriteWord(Length('test1')+1);
   Fsection0.Write('test1', 5);
   FSection0.WriteByte(0);}
-  
+
   // 0 Table of contents filename
   if FHasTOC then begin
     TmpStr := 'default.hhc';
@@ -524,13 +557,13 @@ begin
   Entry.DecompressedSize := FSection0.Position - Entry.DecompressedOffset;
   FInternalFiles.AddEntry(Entry);
 
-  {// 7 Binary Index
+  // 7 Binary Index
   if FHasBinaryIndex then
   begin
     FSection0.WriteWord(NToLE(Word(7)));
     FSection0.WriteWord(NToLE(Word(4)));
     FSection0.WriteDWord(DWord(0)); // what is this number to be?
-  end;}
+  end;
 
   // 11 Binary TOC
   if FHasBinaryTOC then
@@ -551,7 +584,7 @@ begin
   Entry.Compressed := False;
   Entry.DecompressedOffset :=0;// FSection0.Position;
   Entry.DecompressedSize := 0;
-  
+
   FInternalFiles.AddEntry(Entry);
 end;
 
@@ -569,14 +602,13 @@ begin
   cnt:=pinteger(state)^;
   for i := 0 to AWord.DocumentCount-1 do
     Inc(cnt, AWord.GetLogicalDocument(i).NumberOfIndexEntries);
-          // was commented in original procedure, seems to list index entries per doc. 
+          // was commented in original procedure, seems to list index entries per doc.
             //WriteLn(AWord.TheWord,'             documents = ', AWord.DocumentCount, ' h
   pinteger(state)^:=cnt;
-end;  
+end;
 
 procedure TChmWriter.WriteTOPICS;
 var
-  AWord: TIndexedWord;
   FHits: Integer;
   i: Integer;
 begin
@@ -584,7 +616,7 @@ begin
     Exit;
   FTopicsStream.Position := 0;
   PostAddStreamToArchive('#TOPICS', '/', FTopicsStream);
- // I commented the code below since the result seemed unused 
+ // I commented the code below since the result seemed unused
  // FHits:=0;
  //   FIndexedFiles.ForEach(@IterateWord,FHits);
 end;
@@ -596,7 +628,7 @@ begin
   FContextStream.Position := 0;
   // the size of all the entries
   FContextStream.WriteDWord(NToLE(DWord(FContextStream.Size-SizeOf(dword))));
-  
+
   FContextStream.Position := 0;
   AddStreamToArchive('#IVB', '/', FContextStream);
 end;
@@ -802,7 +834,7 @@ begin
   Entry.Path := '::DataSpace/Storage/MSCompressed/';
   Entry.Name := 'ControlData';
   FInternalFiles.AddEntry(Entry, False);
-  
+
   //  ::DataSpace/Storage/MSCompressed/SpanInfo
   Entry.DecompressedOffset := FSection0.Position;
   Entry.DecompressedSize := WriteSpanInfoToStream(FSection0, FReadCompressedSize);
@@ -833,16 +865,24 @@ begin
   Entry.Name := 'Content';
   FInternalFiles.AddEntry(Entry, False);
 
-  
+
 end;
 
 function TChmWriter.AddString(AString: String): LongWord;
 var
   NextBlock: DWord;
   Pos: DWord;
+  n  : TAVLTreeNode;
+  StrRec : TStringIndex;
 begin
   // #STRINGS starts with a null char
   if FStringsStream.Size = 0 then FStringsStream.WriteByte(0);
+
+  SpareString.TheString:=AString;
+  n:=fAvlStrings.FindKey(SpareString,@CompareStrings);
+  if assigned(n) then
+   exit(TStringIndex(n.data).strid);
+
   // each entry is a null terminated string
   Pos := DWord(FStringsStream.Position);
 
@@ -857,11 +897,16 @@ begin
   Result := FStringsStream.Position;
   FStringsStream.WriteBuffer(AString[1], Length(AString));
   FStringsStream.WriteByte(0);
+
+  StrRec:=TStringIndex.Create;
+  StrRec.TheString:=AString;
+  StrRec.Strid    :=Result;
+  fAvlStrings.Add(StrRec);
 end;
 
 function TChmWriter.AddURL ( AURL: String; TopicsIndex: DWord ) : LongWord;
 
-  procedure CheckURLStrBlockCanHold(AString: String);
+  procedure CheckURLStrBlockCanHold(Const AString: String);
   var
     Rem: LongWord;
     Len: LongWord;
@@ -876,26 +921,48 @@ function TChmWriter.AddURL ( AURL: String; TopicsIndex: DWord ) : LongWord;
       end;
   end;
 
-  function AddURLString(AString: String): DWord;
+  function AddURLString(Const AString: String): DWord;
+  var urlstrrec : TUrlStrIndex;
   begin
     CheckURLStrBlockCanHold(AString);
     if FURLSTRStream.Size mod $4000 = 0 then
       FURLSTRStream.WriteByte(0);
       Result := FURLSTRStream.Position;
+      UrlStrRec:=TUrlStrIndex.Create;
+      UrlStrRec.UrlStr:=AString;
+      UrlStrRec.UrlStrid:=result;
+      FAvlUrlStr.Add(UrlStrRec);
       FURLSTRStream.WriteDWord(NToLE(DWord(0))); // URL Offset for topic after the the "Local" value
       FURLSTRStream.WriteDWord(NToLE(DWord(0))); // Offset of FrameName??
       FURLSTRStream.Write(AString[1], Length(AString));
       FURLSTRStream.WriteByte(0); //NT
   end;
+
+  function LookupUrlString(const AUrl : String):DWord;
+  var n :TAvlTreeNode;
+  begin
+    SpareUrlStr.UrlStr:=AUrl;
+    n:=FAvlUrlStr.FindKey(SpareUrlStr,@CompareUrlStrs);
+    if assigned(n) Then
+      result:=TUrlStrIndex(n.data).UrlStrId
+    else
+      result:=AddUrlString(AUrl);
+  end;
+
+
+var UrlIndex : Integer;
+
 begin
   if AURL[1] = '/' then Delete(AURL,1,1);
+  UrlIndex:=LookupUrlString(AUrl);
+
   //if $1000 - (FURLTBLStream.Size mod $1000) = 4 then // we are at 4092
   if FURLTBLStream.Size and $FFC = $FFC then // faster :)
     FURLTBLStream.WriteDWord(0);
   Result := FURLTBLStream.Position;
   FURLTBLStream.WriteDWord(0);//($231e9f5c); //unknown
   FURLTBLStream.WriteDWord(NtoLE(TopicsIndex)); // Index of topic in #TOPICS
-  FURLTBLStream.WriteDWord(NtoLE(AddURLString(AURL)));
+  FURLTBLStream.WriteDWord(NtoLE(UrlIndex));
 end;
 
 function _AtEndOfData(arg: pointer): LongBool; cdecl;
@@ -931,7 +998,7 @@ begin
       FileEntry.DecompressedSize := FCurrentStream.Size;
       FileEntry.DecompressedOffset := FReadCompressedSize; //269047723;//to test writing really large numbers
       FileEntry.Compressed := True;
-      
+
       if FullTextSearch then
         CheckFileMakeSearchable(FCurrentStream, FileEntry);
 
@@ -1074,6 +1141,11 @@ begin
   FDestroyStream := FreeStreamOnDestroy;
   FFileNames := TStringList.Create;
   FIndexedFiles := TIndexedWordList.Create;
+  FAvlStrings   := TAVLTree.Create(@CompareStrings);    // dedupe strings
+  FAvlURLStr    := TAVLTree.Create(@CompareUrlStrs);    // dedupe urltbl + binindex must resolve URL to topicid
+  SpareString   := TStringIndex.Create;                 // We need an object to search in avltree
+  SpareUrlStr   := TUrlStrIndex.Create;                 //    to avoid create/free circles we keep one in spare
+                                                        //    for searching purposes
 end;
 
 destructor TChmWriter.Destroy;
@@ -1093,6 +1165,12 @@ begin
   FDirectoryListings.Free;
   FFileNames.Free;
   FIndexedFiles.Free;
+  SpareString.free;
+  SpareUrlStr.free;
+  FAvlUrlStr.FreeAndClear;
+  FAvlUrlStr.Free;
+  FAvlStrings.FreeAndClear;
+  FAvlStrings.Free;
   inherited Destroy;
 end;
 
@@ -1104,12 +1182,12 @@ begin
 
   // write any internal files to FCurrentStream that we want in the compressed section
   WriteIVB;
-  
+
   // written to Section0 (uncompressed)
   WriteREADMEFile;
 
   WriteOBJINST;
-  
+
   // move back to zero so that we can start reading from zero :)
   FReadCompressedSize := FCurrentStream.Size;
   FCurrentStream.Position := 0;  // when compressing happens, first the FCurrentStream is read
@@ -1128,7 +1206,7 @@ begin
 
   //this creates all special files in the archive that start with ::DataSpace
   WriteDataSpaceFiles(FSection0);
-  
+
   // creates all directory listings including header
   CreateDirectoryListings;
 
@@ -1200,7 +1278,7 @@ begin
       FreeAndNil(NextLevelItems);
 
   while NextLevelItems <> nil do
-  begin  
+  begin
     CurrentLevelItems := NextLevelItems;
     NextLevelItems := TFPList.Create;
 
@@ -1314,7 +1392,6 @@ begin
   TOCIDXStream.Position := 0;
   AppendBinaryTOCStream(TOCIDXStream);
   TOCIDXStream.Free;
-
 end;
 
 procedure TChmWriter.AppendBinaryTOCStream(AStream: TStream);