123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092 |
- { Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas
- This library is free software; you can redistribute it and/or modify it
- under the terms of the GNU Library General Public License as published by
- the Free Software Foundation; either version 2 of the License, or (at your
- option) any later version.
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
- for more details.
- You should have received a copy of the GNU Library General Public License
- along with this library; if not, write to the Free Software Foundation,
- Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- }
- {
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- }
- unit chmFiftiMain;
- {$mode objfpc}{$H+}
- interface
- uses Classes, HTMLIndexer;
- type
- TFiftiMainHeader = record
- Sig: array [0..3] of byte; //$00,$00,$28,$00
- HTMLFilesCount: DWord;
- RootNodeOffset: DWord;
- Unknown1: DWord; // = 0
- LeafNodeCount: DWord;
- CopyOfRootNodeOffset: DWord;
- TreeDepth: Word;
- Unknown2: DWord; // = 7
- DocIndexScale: Byte;
- DocIndexRootSize: Byte;
- CodeCountScale: Byte;
- CodeCountRootSize: Byte;
- LocationCodeScale: Byte;
- LocationCodeRootSize: Byte;
- Unknown3: array[0..9] of byte; // = 0
- NodeSize: DWord; // 4096;
- Unknown4: DWord; // 0 or 1;
- LastDupWordIndex: DWord;
- LastDupCharIndex: DWord;
- LongestWordLength: DWord; // maximum 99
- TotalWordsIndexed: DWord; // includes duplicates
- TotalWords: DWord; // word count not including duplicates
- TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
- TotalWordsLengthPart2: DWord;
- TotalWordsLength: DWord; // length of all words not including duplicates
- WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
- Unknown5: DWord; // 0
- HTMLFilesCountMinusOne: DWord; // maybe
- Unknown6: array[0..23] of Byte; // 0
- WindowsCodePage: DWord; // usually 1252
- LocalID: DWord;
- //Unknown7: array [0..893] of Byte; // 0
- end;
- { TFIftiNode }
- TFIftiNode = class(TObject)
- FLastWord: String;
- FWriteStream: TStream;
- FBlockStream: TMemoryStream;
- ParentNode: TFIftiNode;
- OwnsParentNode : boolean;
- function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
- procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
- function GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
- procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
- procedure FillRemainingSpace;
- function RemainingSpace: DWord;
- constructor Create(AStream: TStream);
- destructor Destroy; override;
- end;
- { TChmSearchWriter }
- TChmSearchWriter = class(TObject)
- private
- FHeaderRec: TFiftiMainHeader;
- FStream: TStream;
- FWordList: TIndexedWordList;
- FActiveLeafNode: TFIftiNode;
- function GetHasData: Boolean;
- procedure ProcessWords;
- procedure WriteHeader(IsPlaceHolder: Boolean);
- procedure WriteAWord(AWord: TIndexedWord);
- public
- procedure WriteToStream;
- property HasData: Boolean read GetHasData;
- constructor Create(AStream: TStream; AWordList: TIndexedWordList);
- destructor Destroy; override;
- end;
- { TChmSearchReader }
- TChmWLCTopic = record
- TopicIndex: DWord;
- LocationCodes: array of DWord;
- end;
- TChmWLCTopicArray = array of TChmWLCTopic;
- TChmSearchReader = class;
- TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;
- TChmSearchReader = class(TObject)
- private
- FStream: TStream;
- FFileIsValid: Boolean;
- FFreeStreamOnDestroy: Boolean;
- FDocRootSize,
- FCodeCountRootSize,
- FLocCodeRootSize: Integer;
- FTreeDepth: Integer;
- FRootNodeOffset: DWord;
- FActiveNodeStart: DWord;
- FActiveNodeFreeSpace: Word;
- FNextLeafNode: DWord;
- procedure ReadCommonData;
- procedure MoveToFirstLeafNode;
- procedure MoveToRootNode;
- procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
- function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
- function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
- function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
- function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
- public
- constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
- destructor Destroy; override;
- procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
- function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
- property FileIsValid: Boolean read FFileIsValid;
- end;
- const
- FIFTI_NODE_SIZE = 4096;
- implementation
- uses SysUtils, Math, ChmBase;
- type
- { TIndexNode }
- TIndexNode = class(TFIftiNode)
- function GuessIfCanHold(AWord: String): Boolean; override;
- procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
- procedure Flush(NewBlockNeeded: Boolean); override;
- end;
- { TLeafNode }
- TLeafNode = class(TFIftiNode)
- FLeafNodeCount: DWord;
- FLastNodeStart: DWord;
- FreeSpace: DWord;
- FDocRootSize,
- FCodeRootSize,
- FLocRootSize: Byte;
- procedure WriteInitialHeader;
- Destructor Destroy; override;
- function GuessIfCanHold(AWord: String): Boolean; override;
- procedure Flush(NewBlockNeeded: Boolean); override;
- procedure AddWord(AWord: TIndexedWord);
- function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
- property LeafNodeCount: DWord read FLeafNodeCount;
- property DocRootSize: Byte read FDocRootSize write FDocRootSize;
- property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
- property LocRootSize: Byte read FLocRootSize write FLocRootSize;
- end;
- function GetCompressedIntegerBE(Stream: TStream): DWord;
- var
- Buf: Byte;
- Value: Dword = 0;
- Shift: Integer = 0;
- begin
- repeat
- Buf := Stream.ReadByte;
- Value := Value or (Buf and $7F) shl Shift;
- Inc(Shift, 7);
- until (Buf and $80) = 0;
- Result := Value;
- end;
- procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
- var
- Bits: Integer;
- Tmp: DWord;
- Buf: Byte;
- begin
- Tmp := AInt;
- Bits := 0;
- while Tmp <> 0 do
- begin
- Tmp := Tmp shr 1;
- Inc(Bits);
- end;
- repeat
- Buf := (AInt shr (Tmp * 7)) and $7F;
- if Bits > 7 then
- Buf := Buf or $80;
- Dec(Bits, 7);
- Inc(Tmp);
- Stream.WriteByte(Buf);
- until Bits <= 0;
- end;
- function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
- var
- Tmp: DWord;
- Mask: DWord;
- // Scale: Integer;
- NeededBits: Integer;
- PrefixBits: Integer;
- RootBits: Integer;
- begin
- // Scale := 2;
- Bits := 0;
- Result := Root;
- Tmp := ANumber;
- NeededBits := 0;
- while Tmp <> 0 do
- begin
- Inc(NeededBits);
- Tmp := Tmp shr 1;
- end;
- PrefixBits := Max(0, NeededBits-Root);
- RootBits := NeededBits -1; //
- if RootBits < Root then
- RootBits := Root;
- if RootBits < 0 then
- RootBits := 0;
- Mask := 0;
- if RootBits-1 >= 0 then
- for Tmp := 0 to RootBits-1 do
- Mask := Mask or (DWord(1) shl Tmp);
- Bits := not Mask;
- Bits := Bits shl 1; // make space for empty bit
- Bits := Bits or (ANumber and Mask);
- Result := PrefixBits + 1 + RootBits;
- Bits := (Bits shl (32-Result)) shr (32 - Result);
- end;
- { TChmSearchWriter }
- procedure TChmSearchWriter.ProcessWords;
- begin
- FWordList.ForEach(@WriteAword);
- if FActiveLeafNode <> nil then
- FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
- end;
- function TChmSearchWriter.GetHasData: Boolean;
- begin
- Result := FWordList.IndexedFileCount > 0;
- end;
- procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
- var
- TmpNode: TFIftiNode;
- i: Integer;
- begin
- if IsPlaceHolder then
- begin
- FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
- FStream.Position := $400;
- FillChar(PChar(TMemoryStream(FStream).Memory)^, $400, 0);
- FHeaderRec.DocIndexRootSize := 1;
- FHeaderRec.CodeCountRootSize := 1;
- FHeaderRec.LocationCodeRootSize := 4;
- Exit;
- end;
- // write the glorious header
- FHeaderRec.Sig[2] := $28;
- FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
- FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE;
- FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
- FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
- FHeaderRec.TreeDepth := 0;
- TmpNode := FActiveLeafNode;
- while TmpNode <> nil do
- begin
- Inc(FHeaderRec.TreeDepth);
- TmpNode := TmpNode.ParentNode;
- end;
- FHeaderRec.DocIndexScale := 2;
- FHeaderRec.CodeCountScale := 2;
- FHeaderRec.LocationCodeScale := 2;
- //FHeaderRec.DocIndexRootSize := 15;
- //FHeaderRec.CodeCountRootSize := 15;
- //FHeaderRec.LocationCodeRootSize := 15;
- FHeaderRec.NodeSize := FIFTI_NODE_SIZE;
- FHeaderRec.LongestWordLength := FWordList.LongestWord;
- FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
- FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
- FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
- FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
- FHeaderRec.WindowsCodePage := 1252;
- FStream.Position := 0;
- FStream.Write(FHeaderRec.Sig[0], 4);
- FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
- FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
- FStream.WriteDWord(NtoLE(0)); // unknown 1
- FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
- FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
- FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
- FStream.WriteDWord(NtoLE(DWord(7)));
- FStream.WriteByte(2);
- FStream.WriteByte(FHeaderRec.DocIndexRootSize);
- FStream.WriteByte(2);
- FStream.WriteByte(FHeaderRec.CodeCountRootSize);
- FStream.WriteByte(2);
- FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
- // eat 10 bytes
- FStream.WriteWord(0);
- FStream.WriteDWord(0);
- FStream.WriteDWord(0);
- FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
- FStream.WriteDWord(NtoLE(DWord(0)));
- FStream.WriteDWord(1);
- FStream.WriteDWord(5);
- FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
- FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
- FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
- FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
- FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
- FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
- FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
- FStream.WriteDWord(NtoLE(0));
- FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
- for i := 0 to 23 do FStream.WriteByte(0);
- FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
- FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
- for i := 0 to 893 do FStream.WriteByte(0);
- end;
- procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;
- begin
- if FActiveLeafNode = nil then
- begin
- FActiveLeafNode := TLeafNode.Create(FStream);
- with TLeafNode(FActiveLeafNode) do
- begin
- DocRootSize := FHeaderRec.DocIndexRootSize;
- CodeRootSize := FHeaderRec.CodeCountRootSize;
- LocRootSize := FHeaderRec.LocationCodeRootSize;
- end;
- end;
- if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
- begin
- FActiveLeafNode.Flush(True);
- end;
- TLeafNode(FActiveLeafNode).AddWord(AWord);
- end;
- procedure TChmSearchWriter.WriteToStream;
- begin
- WriteHeader(True);
- ProcessWords;
- WriteHeader(False);
- end;
- constructor TChmSearchWriter.Create ( AStream: TStream;
- AWordList: TIndexedWordList ) ;
- begin
- FStream := AStream;
- FWordList := AWordList;
- FActiveLeafNode:=NIL;
- end;
- destructor TChmSearchWriter.Destroy;
- begin
- freeandnil(FActiveLeafNode);
- end;
- { TLeafNode }
- function TFIftiNode.RemainingSpace: DWord;
- begin
- Result := FIFTI_NODE_SIZE - FBlockStream.Position;
- end;
- constructor TFIftiNode.Create ( AStream: TStream ) ;
- begin
- inherited Create;
- FWriteStream := AStream;
- FBlockStream := TMemoryStream.Create;
- OwnsParentNode :=false;
- end;
- destructor TFIftiNode.Destroy;
- begin
- FBlockStream.Free;
- if OwnsParentNode then ParentNode.Free;
- inherited Destroy;
- end;
- procedure TFIftiNode.FillRemainingSpace;
- begin
- while RemainingSpace > 0 do
- FBlockStream.WriteByte(0);
- end;
- function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
- var
- Count1,
- Count2: Integer;
- Count: Integer;
- i: Integer;
- begin
- if AWord = AOldWord then
- begin
- AOffset := Length(AWord);
- Exit('');
- end;
- // else
- Count1 := Length(AOldWord);
- Count2 := Length(AWord);
- if Count1<Count2 then
- Count := Count1
- else
- Count := Count2;
- for i := 1 to Count do
- begin
- AOffset := i-1;
- if AOldWord[i] <> AWord[i]
- then Exit(Copy(AWord, i, Length(AWord)));
- end;
- Result := AWord;
- AOffset := 0;
- end;
- procedure TLeafNode.WriteInitialHeader;
- begin
- FBlockStream.WriteDWord(0);
- FBlockStream.WriteWord(0);
- FBlockStream.WriteWord(0);
- end;
- destructor TLeafNode.Destroy;
- begin
- inherited Destroy;
- end;
- function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
- var
- WordOffset: Byte;
- begin
- Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
- end;
- procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
- var
- FTmpPos: DWord;
- begin
- Inc(FLeafNodeCount);
- FTmpPos := FWriteStream.Position;
- // update the previous leaf node about our position.
- if FLastNodeStart > 0 then
- begin
- FWriteStream.Position := FLastNodeStart;
- FWriteStream.WriteDWord(NtoLE(FTmpPos));
- FWriteStream.Position := FTmpPos;
- end;
- FLastNodeStart := FTmpPos;
- FreeSpace := RemainingSpace;
- FillRemainingSpace;
- // update the leaf header to show the available space.
- FBlockStream.Position := 6;
- FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));
- // copy the leaf block to the fiftimain file
- FBlockStream.Position := 0;
- FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
- FBlockStream.Position := 0;
- if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
- begin
- if ParentNode = nil then
- begin
- ParentNode := TIndexNode.Create(FWriteStream);
- OwnsParentNode:=True;
- end;
- ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
- if (NewBlockNeeded = False) then
- ParentNode.Flush(False);
- end;
- FLastWord := '';
- end;
- procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
- var
- Offset: Byte;
- NewWord: String;
- WLCSize: DWord;
- begin
- if Length(AWord.TheWord) > 99 then
- Exit; // Maximum word length is 99
- if FBlockStream.Position = 0 then
- WriteInitialHeader;
- NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);
- FLastWord := AWord.TheWord;
- FBlockStream.WriteByte(Length(NewWord)+1);
- FBlockStream.WriteByte(Offset);
- // length can be 0 if it is the same word as the last. there is a word entry each for title and content
- if Length(NewWord) > 0 then
- FBlockStream.Write(NewWord[1], Length(NewWord));
- FBlockStream.WriteByte(Ord(AWord.IsTitle));
- WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
- FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
- FBlockStream.WriteWord(0);
- // write WLC to FWriteStream so we can write the size of the wlc entries
- WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
- WriteCompressedIntegerBE(FBlockStream, WLCSize);
- if FBlockStream.Position > FIFTI_NODE_SIZE then
- raise Exception.Create('FIFTIMAIN Leaf node has written past the block!');
- end;
- function Min(AValue, BValue: Byte): Byte;
- begin
- if AValue < BValue then
- Result := AValue
- else Result := BValue;
- end;
- function Max(AValue, BValue: Byte): Byte;
- begin
- if AValue > BValue then
- Result := AValue
- else Result := BValue;
- end;
- function Max(AValue, BValue: Integer): Integer;
- begin
- if AValue > BValue then
- Result := AValue
- else Result := BValue;
- end;
- function Max(AValue, BValue: DWord): DWord;
- begin
- if AValue > BValue then
- Result := AValue
- else Result := BValue;
- end;
- function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
- var
- LastDocIndex: DWord;
- LastLocCode: DWord;
- UsedBits: Byte;
- Buf: Byte;
- function NewDocDelta(ADocIndex: DWord): DWord;
- begin
- Result := ADocIndex - LastDocIndex;
- LastDocIndex := ADocIndex;
- end;
- function NewLocCode(ALocCode: DWord): DWord;
- begin
- Result := ALocCode - LastLocCode;
- LastLocCode := ALocCode;
- end;
- procedure AddValue(AValue: DWord; BitCount: Byte);
- var
- NeededBits: Byte;
- Tmp: Byte;
- begin
- AValue := AValue shl (32 - BitCount);
- while BitCount > 0 do
- begin
- NeededBits := 8 - UsedBits;
- Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
- Buf := Buf or Tmp;
- Inc(UsedBits, Min(BitCount, NeededBits));
- AValue := AValue shl Min(BitCount, NeededBits);
- Dec(BitCount, Min(BitCount, NeededBits));
- if (UsedBits = 8) then
- begin
- FWriteStream.WriteByte(Buf);
- UsedBits := 0;
- NeededBits := 0;
- Buf := 0;
- end;
- end;
- end;
- procedure FlushBuffer;
- begin
- if UsedBits > 0 then
- FWriteStream.WriteByte(Buf);
- UsedBits := 0;
- Buf := 0;
- end;
- var
- DocDelta: DWord;
- LocDelta: DWord;
- StartPos: DWord;
- Bits: DWord;
- BitCount: Byte;
- i,
- j: Integer;
- Doc: TIndexDocument;
- // proced
- begin
- StartPos := FWriteStream.Position;
- LastDocIndex := 0;
- UsedBits := 0;
- Buf := 0;
- for i := 0 to AWord.DocumentCount-1 do
- begin
- LastLocCode := 0;
- Doc := AWord.GetLogicalDocument(i);
- DocDelta := NewDocDelta(Doc.DocumentIndex);
- BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
- AddValue(Bits, BitCount);
- BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
- AddValue(Bits, BitCount);
- for j := 0 to Doc.NumberOfIndexEntries-1 do
- begin
- LocDelta := NewLocCode(Doc.IndexEntry[j]);
- BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
- AddValue(Bits, BitCount);
- end;
- FlushBuffer;
- end;
- Result := FWriteStream.Position-StartPos;
- end;
- { TIndexNode }
- function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
- var
- Offset: Byte;
- begin
- Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
- end;
- procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
- var
- Offset: Byte;
- NewWord: String;
- begin
- if FBlockStream.Position = 0 then
- FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
- if GuessIfCanHold(AWord) = False then
- Flush(True);
- NewWord := AdjustedWord(AWord, Offset, FLastWord);
- FLastWord:=AWord;
- // Write the Index node Entry
- FBlockStream.WriteByte(Length(NewWord)+1);
- FBlockStream.WriteByte(Offset);
- FBlockStream.Write(NewWord[1], Length(NewWord));
- FBlockStream.WriteDWord(NtoLE(ANodeOffset));
- FBlockStream.WriteWord(0);
- if FBlockStream.Position > FIFTI_NODE_SIZE then
- raise Exception.Create('FIFTIMAIN Index node has written past the block!');
- end;
- procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
- var
- RemSize: DWord;
- begin
- if NewBlockNeeded then
- begin
- if ParentNode = nil then
- begin
- ParentNode := TIndexNode.Create(FWriteStream);
- OwnsParentNode:=True;
- end;
- end;
- if ParentNode <> nil then
- ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);
- RemSize := RemainingSpace;
- FillRemainingSpace;
- FBlockStream.Position := 0;
- FBlockStream.WriteWord(NtoLE(RemSize));
- FBlockStream.Position := 0;
- FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
- FBlockStream.Position := 0;
- FLastWord := '';
- if NewBlockNeeded then
- FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
- else
- if ParentNode <> nil then
- ParentNode.Flush(NewBlockNeeded);
- end;
- { TChmSearchReader }
- procedure TChmSearchReader.ReadCommonData;
- var
- Sig: DWord;
- begin
- FStream.Position := 0;
- Sig := LEtoN(FStream.ReadDWord);
- FFileIsValid := Sig = $00280000;
- if not FileIsValid then
- Exit;
- // root node address
- FStream.Position := $8;
- FRootNodeOffset := LEtoN(FStream.ReadDWord);
- // Tree Depth
- FStream.Position := $18;
- FTreeDepth := LEtoN(FStream.ReadWord);
- // Root sizes for scale and root integers
- FStream.Position := $1E;
- if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
- FFileIsValid := False;
- FDocRootSize := FStream.ReadByte;
- if FStream.ReadByte <> 2 then
- FFileIsValid := False;
- FCodeCountRootSize := FStream.ReadByte;
- if FStream.ReadByte <> 2 then
- FFileIsValid := False;
- FLocCodeRootSize := FStream.ReadByte;
- end;
- procedure TChmSearchReader.MoveToFirstLeafNode;
- var
- NodeDepth: Integer;
- NodeOffset: DWord;
- LastWord: String;
- NewWord: String;
- begin
- NodeDepth := FTreeDepth;
- MoveToRootNode;
- while NodeDepth > 1 do
- begin
- LastWord := '';
- ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
- Dec(NodeDepth);
- MoveToNode(NodeOffset, NodeDepth);
- end;
- end;
- procedure TChmSearchReader.MoveToRootNode;
- begin
- MoveToNode(FRootNodeOffset, FTreeDepth);
- end;
- procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
- begin
- FStream.Position := ANodeOffset;
- FActiveNodeStart := FStream.Position;
- if ANodeDepth > 1 then
- begin
- FnextLeafNode := 0;
- FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
- end
- else
- begin
- FnextLeafNode := LEtoN(FStream.ReadDWord);
- FStream.ReadWord;
- FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
- end;
- end;
- function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
- var
- WordLength: Integer;
- CopyLastWordCharCount: Integer;
- begin
- WordLength := FStream.ReadByte;
- CopyLastWordCharCount := FStream.ReadByte;
- if CopyLastWordCharCount > 0 then
- Result := Copy(ALastWord, 1, CopyLastWordCharCount);
- SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
- FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
- end;
- function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
- ASubNodeStart: DWord ): Boolean;
- begin
- Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
- if not Result then
- Exit;
- AWord := ReadWordOrPartialWord(ALastWord);
- ASubNodeStart := LEtoN(FStream.ReadDWord);
- FStream.ReadWord;
- end;
- function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
- AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
- AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
- begin
- Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
- if not Result then
- Exit;
- AWord := ReadWordOrPartialWord(ALastWord);
- AInTitle := FStream.ReadByte = 1;
- AWLCCount := GetCompressedIntegerBE(FStream);
- AWLCOffset := LEtoN(FStream.ReadDWord);
- FStream.ReadWord;
- AWLCSize := GetCompressedIntegerBE(FStream);
- end;
- function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;
- function AtEndOfWLCEntries: Boolean;
- begin
- Result := AWLCOffset + AWLCSize <= FStream.Position;
- end;
- var
- Buf: Byte;
- BitsInBuffer: Integer;
- procedure FillBuffer;
- begin
- while (BitsInBuffer = 0) and not AtEndOfWLCEntries do
- begin
- Buf := FStream.ReadByte;
- Inc(BitsInBuffer, 8);
- end;
- end;
- function ReadWLC(RootSize: DWord): DWord;
- var
- PrefixBits: Integer = 0;
- BitCount: Integer = 0;
- RemainingBits: Integer; // only the bits for this number not the bits in buffer
- begin
- FillBuffer;
- Result := 0;
- while (Buf and $80) > 0 do // find out how many prefix bits there are
- begin
- Inc(PrefixBits);
- Buf := Buf shl 1;
- Dec(BitsInBuffer);
- FillBuffer;
- end;
- if PrefixBits > 0 then
- Result := 1;
- Inc(BitCount, PrefixBits+1);
- Buf := Buf shl 1;
- Dec(BitsInBuffer);
- FillBuffer;
- Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
- while RemainingBits > 0 do
- begin
- Result := Result shl 1;
- Result := Result or (Buf shr 7);
- Dec(RemainingBits);
- Buf := Buf shl 1;
- Dec(BitsInBuffer);
- FillBuffer;
- Inc(BitCount);
- end;
- end;
- procedure ClearBuffer;
- begin
- BitsInBuffer := 0;
- Buf := 0;
- end;
- var
- TopicHits: DWord;
- i: Integer;
- j: Integer;
- CachedStreamPos: QWord;
- LastDoc,
- LastLocCode: DWord;
- begin
- CachedStreamPos := FStream.Position;
- FStream.Position := AWLCOffset;
- {for i := 0 to AWLCSize-1 do
- begin
- Buf := FStream.ReadByte;
- Write(binStr(Buf, 8), ' ');
- end;}
- FStream.Position := AWLCOffset;
- SetLength(Result, AWLCCount);
- Buf := 0;
- BitsInBuffer := 0;
- LastDoc := 0;
- for i := 0 to AWLCCount-1 do
- begin
- Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;
- LastDoc := Result[i].TopicIndex;
- TopicHits := ReadWLC(FCodeCountRootSize);
- SetLength(Result[i].LocationCodes, TopicHits);
- LastLocCode := 0;
- for j := 0 to TopicHits-1 do
- begin
- Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
- LastLocCode := Result[i].LocationCodes[j];
- end;
- ClearBuffer;
- end;
- FStream.Position := CachedStreamPos;
- end;
- constructor TChmSearchReader.Create ( AStream: TStream;
- AFreeStreamOnDestroy: Boolean ) ;
- begin
- FStream := AStream;
- FFreeStreamOnDestroy := AFreeStreamOnDestroy;
- ReadCommonData;
- end;
- destructor TChmSearchReader.Destroy;
- begin
- if FFreeStreamOnDestroy then
- FreeAndNil(FStream);
- inherited Destroy;
- end;
- procedure TChmSearchReader.DumpData (
- AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
- var
- LastWord: String;
- TheWord: String;
- InTitle: Boolean;
- WLCCount: DWord;
- WLCOffset: DWord;
- WLCSize: DWord;
- FoundHits: TChmWLCTopicArray;
- i: Integer;
- j: Integer;
- begin
- MoveToFirstLeafNode;
- LastWord := '';
- repeat
- if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
- begin
- if FnextLeafNode <> 0 then
- begin
- MoveToNode(FnextLeafNode, 1);
- LastWord := '';
- end
- else
- Break;
- end
- else begin
- LastWord := TheWord;
- //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
- FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
- //WriteLn('DONE Reading Hits for ', TheWord);
- // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
- for i := 0 to High(FoundHits) do
- for j := 0 to High(FoundHits[i].LocationCodes) do
- AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
- end;
- until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
- end;
- function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
- var
- LastWord: String;
- NewWord: String;
- NodeLevel: Integer;
- NewNodePosition: DWord;
- InTitle: Boolean;
- WLCCount: DWord;
- WLCOffset: DWord;
- WLCSize: DWord;
- CompareResult: Integer;
- ReadNextResult: Boolean;
- begin
- AWord := LowerCase(AWord);
- NodeLevel := FTreeDepth;
- MoveToRootNode;
- SetLength(Result, 0);
- LastWord := '';
- // descend the index node tree until we find the leafnode
- while NodeLevel > 1 do begin
- //WriteLn('At Node Level ', NodeLevel);
- if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
- begin
- LastWord := NewWord;
- //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
- if ChmCompareText(NewWord, AWord) >= 0 then
- begin
- LastWord := '';
- Dec(NodeLevel);
- MoveToNode(NewNodePosition, NodeLevel);
- end;
- end
- else
- Break;
- end;
- if NodeLevel > 1 then
- Exit; // the entry we are looking for is > than the last entry of the last index node
- // now we are in a leafnode
- while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
- begin
- //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
- LastWord := NewWord;
- if Length(NewWord) < Length(AWord) then
- continue;
- if AStartsWith then //it only has to start with the searched term
- CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord)))
- else // it must match exactly
- CompareResult := ChmCompareText(AWord, NewWord);
- if CompareResult < 0 then
- Exit;
- if CompareResult = 0 then
- begin
- if InTitle then
- ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
- else
- Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
- // check if the next entry is the same word since there is an entry for titles and for body
- if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
- ReadNextResult := True
- else if (FNextLeafNode <> 0) then
- begin
- MoveToNode(FNextLeafNode, 1);
- LastWord := '';
- ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
- end;
- if ReadNextResult and (NewWord = AWord) then
- begin
- if InTitle then
- ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
- else
- Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
- end;
- Exit;
- end;
- end;
- end;
- end.
|