chmfiftimain.pas 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092
  1. { Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. unit chmFiftiMain;
  19. {$mode objfpc}{$H+}
  20. interface
  21. uses Classes, HTMLIndexer;
  22. type
  23. TFiftiMainHeader = record
  24. Sig: array [0..3] of byte; //$00,$00,$28,$00
  25. HTMLFilesCount: DWord;
  26. RootNodeOffset: DWord;
  27. Unknown1: DWord; // = 0
  28. LeafNodeCount: DWord;
  29. CopyOfRootNodeOffset: DWord;
  30. TreeDepth: Word;
  31. Unknown2: DWord; // = 7
  32. DocIndexScale: Byte;
  33. DocIndexRootSize: Byte;
  34. CodeCountScale: Byte;
  35. CodeCountRootSize: Byte;
  36. LocationCodeScale: Byte;
  37. LocationCodeRootSize: Byte;
  38. Unknown3: array[0..9] of byte; // = 0
  39. NodeSize: DWord; // 4096;
  40. Unknown4: DWord; // 0 or 1;
  41. LastDupWordIndex: DWord;
  42. LastDupCharIndex: DWord;
  43. LongestWordLength: DWord; // maximum 99
  44. TotalWordsIndexed: DWord; // includes duplicates
  45. TotalWords: DWord; // word count not including duplicates
  46. TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
  47. TotalWordsLengthPart2: DWord;
  48. TotalWordsLength: DWord; // length of all words not including duplicates
  49. WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
  50. Unknown5: DWord; // 0
  51. HTMLFilesCountMinusOne: DWord; // maybe
  52. Unknown6: array[0..23] of Byte; // 0
  53. WindowsCodePage: DWord; // usually 1252
  54. LocalID: DWord;
  55. //Unknown7: array [0..893] of Byte; // 0
  56. end;
  57. { TFIftiNode }
  58. TFIftiNode = class(TObject)
  59. FLastWord: String;
  60. FWriteStream: TStream;
  61. FBlockStream: TMemoryStream;
  62. ParentNode: TFIftiNode;
  63. OwnsParentNode : boolean;
  64. function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
  65. procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
  66. function GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
  67. procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
  68. procedure FillRemainingSpace;
  69. function RemainingSpace: DWord;
  70. constructor Create(AStream: TStream);
  71. destructor Destroy; override;
  72. end;
  73. { TChmSearchWriter }
  74. TChmSearchWriter = class(TObject)
  75. private
  76. FHeaderRec: TFiftiMainHeader;
  77. FStream: TStream;
  78. FWordList: TIndexedWordList;
  79. FActiveLeafNode: TFIftiNode;
  80. function GetHasData: Boolean;
  81. procedure ProcessWords;
  82. procedure WriteHeader(IsPlaceHolder: Boolean);
  83. procedure WriteAWord(AWord: TIndexedWord);
  84. public
  85. procedure WriteToStream;
  86. property HasData: Boolean read GetHasData;
  87. constructor Create(AStream: TStream; AWordList: TIndexedWordList);
  88. destructor Destroy; override;
  89. end;
  90. { TChmSearchReader }
  91. TChmWLCTopic = record
  92. TopicIndex: DWord;
  93. LocationCodes: array of DWord;
  94. end;
  95. TChmWLCTopicArray = array of TChmWLCTopic;
  96. TChmSearchReader = class;
  97. TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;
  98. TChmSearchReader = class(TObject)
  99. private
  100. FStream: TStream;
  101. FFileIsValid: Boolean;
  102. FFreeStreamOnDestroy: Boolean;
  103. FDocRootSize,
  104. FCodeCountRootSize,
  105. FLocCodeRootSize: Integer;
  106. FTreeDepth: Integer;
  107. FRootNodeOffset: DWord;
  108. FActiveNodeStart: DWord;
  109. FActiveNodeFreeSpace: Word;
  110. FNextLeafNode: DWord;
  111. procedure ReadCommonData;
  112. procedure MoveToFirstLeafNode;
  113. procedure MoveToRootNode;
  114. procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  115. function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
  116. function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
  117. function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
  118. function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
  119. public
  120. constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
  121. destructor Destroy; override;
  122. procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
  123. function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  124. property FileIsValid: Boolean read FFileIsValid;
  125. end;
  126. const
  127. FIFTI_NODE_SIZE = 4096;
  128. implementation
  129. uses SysUtils, Math, ChmBase;
  130. type
  131. { TIndexNode }
  132. TIndexNode = class(TFIftiNode)
  133. function GuessIfCanHold(AWord: String): Boolean; override;
  134. procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
  135. procedure Flush(NewBlockNeeded: Boolean); override;
  136. end;
  137. { TLeafNode }
  138. TLeafNode = class(TFIftiNode)
  139. FLeafNodeCount: DWord;
  140. FLastNodeStart: DWord;
  141. FreeSpace: DWord;
  142. FDocRootSize,
  143. FCodeRootSize,
  144. FLocRootSize: Byte;
  145. procedure WriteInitialHeader;
  146. Destructor Destroy; override;
  147. function GuessIfCanHold(AWord: String): Boolean; override;
  148. procedure Flush(NewBlockNeeded: Boolean); override;
  149. procedure AddWord(AWord: TIndexedWord);
  150. function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
  151. property LeafNodeCount: DWord read FLeafNodeCount;
  152. property DocRootSize: Byte read FDocRootSize write FDocRootSize;
  153. property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
  154. property LocRootSize: Byte read FLocRootSize write FLocRootSize;
  155. end;
  156. function GetCompressedIntegerBE(Stream: TStream): DWord;
  157. var
  158. Buf: Byte;
  159. Value: Dword = 0;
  160. Shift: Integer = 0;
  161. begin
  162. repeat
  163. Buf := Stream.ReadByte;
  164. Value := Value or (Buf and $7F) shl Shift;
  165. Inc(Shift, 7);
  166. until (Buf and $80) = 0;
  167. Result := Value;
  168. end;
  169. procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
  170. var
  171. Bits: Integer;
  172. Tmp: DWord;
  173. Buf: Byte;
  174. begin
  175. Tmp := AInt;
  176. Bits := 0;
  177. while Tmp <> 0 do
  178. begin
  179. Tmp := Tmp shr 1;
  180. Inc(Bits);
  181. end;
  182. repeat
  183. Buf := (AInt shr (Tmp * 7)) and $7F;
  184. if Bits > 7 then
  185. Buf := Buf or $80;
  186. Dec(Bits, 7);
  187. Inc(Tmp);
  188. Stream.WriteByte(Buf);
  189. until Bits <= 0;
  190. end;
  191. function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
  192. var
  193. Tmp: DWord;
  194. Mask: DWord;
  195. // Scale: Integer;
  196. NeededBits: Integer;
  197. PrefixBits: Integer;
  198. RootBits: Integer;
  199. begin
  200. // Scale := 2;
  201. Bits := 0;
  202. Result := Root;
  203. Tmp := ANumber;
  204. NeededBits := 0;
  205. while Tmp <> 0 do
  206. begin
  207. Inc(NeededBits);
  208. Tmp := Tmp shr 1;
  209. end;
  210. PrefixBits := Max(0, NeededBits-Root);
  211. RootBits := NeededBits -1; //
  212. if RootBits < Root then
  213. RootBits := Root;
  214. if RootBits < 0 then
  215. RootBits := 0;
  216. Mask := 0;
  217. if RootBits-1 >= 0 then
  218. for Tmp := 0 to RootBits-1 do
  219. Mask := Mask or (DWord(1) shl Tmp);
  220. Bits := not Mask;
  221. Bits := Bits shl 1; // make space for empty bit
  222. Bits := Bits or (ANumber and Mask);
  223. Result := PrefixBits + 1 + RootBits;
  224. Bits := (Bits shl (32-Result)) shr (32 - Result);
  225. end;
  226. { TChmSearchWriter }
  227. procedure TChmSearchWriter.ProcessWords;
  228. begin
  229. FWordList.ForEach(@WriteAword);
  230. if FActiveLeafNode <> nil then
  231. FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
  232. end;
  233. function TChmSearchWriter.GetHasData: Boolean;
  234. begin
  235. Result := FWordList.IndexedFileCount > 0;
  236. end;
  237. procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
  238. var
  239. TmpNode: TFIftiNode;
  240. i: Integer;
  241. begin
  242. if IsPlaceHolder then
  243. begin
  244. FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
  245. FStream.Position := $400;
  246. FillChar(PChar(TMemoryStream(FStream).Memory)^, $400, 0);
  247. FHeaderRec.DocIndexRootSize := 1;
  248. FHeaderRec.CodeCountRootSize := 1;
  249. FHeaderRec.LocationCodeRootSize := 4;
  250. Exit;
  251. end;
  252. // write the glorious header
  253. FHeaderRec.Sig[2] := $28;
  254. FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
  255. FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE;
  256. FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
  257. FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
  258. FHeaderRec.TreeDepth := 0;
  259. TmpNode := FActiveLeafNode;
  260. while TmpNode <> nil do
  261. begin
  262. Inc(FHeaderRec.TreeDepth);
  263. TmpNode := TmpNode.ParentNode;
  264. end;
  265. FHeaderRec.DocIndexScale := 2;
  266. FHeaderRec.CodeCountScale := 2;
  267. FHeaderRec.LocationCodeScale := 2;
  268. //FHeaderRec.DocIndexRootSize := 15;
  269. //FHeaderRec.CodeCountRootSize := 15;
  270. //FHeaderRec.LocationCodeRootSize := 15;
  271. FHeaderRec.NodeSize := FIFTI_NODE_SIZE;
  272. FHeaderRec.LongestWordLength := FWordList.LongestWord;
  273. FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
  274. FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
  275. FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
  276. FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
  277. FHeaderRec.WindowsCodePage := 1252;
  278. FStream.Position := 0;
  279. FStream.Write(FHeaderRec.Sig[0], 4);
  280. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
  281. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
  282. FStream.WriteDWord(NtoLE(0)); // unknown 1
  283. FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
  284. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
  285. FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
  286. FStream.WriteDWord(NtoLE(DWord(7)));
  287. FStream.WriteByte(2);
  288. FStream.WriteByte(FHeaderRec.DocIndexRootSize);
  289. FStream.WriteByte(2);
  290. FStream.WriteByte(FHeaderRec.CodeCountRootSize);
  291. FStream.WriteByte(2);
  292. FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
  293. // eat 10 bytes
  294. FStream.WriteWord(0);
  295. FStream.WriteDWord(0);
  296. FStream.WriteDWord(0);
  297. FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
  298. FStream.WriteDWord(NtoLE(DWord(0)));
  299. FStream.WriteDWord(1);
  300. FStream.WriteDWord(5);
  301. FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
  302. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
  303. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
  304. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
  305. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
  306. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
  307. FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
  308. FStream.WriteDWord(NtoLE(0));
  309. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
  310. for i := 0 to 23 do FStream.WriteByte(0);
  311. FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
  312. FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
  313. for i := 0 to 893 do FStream.WriteByte(0);
  314. end;
  315. procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;
  316. begin
  317. if FActiveLeafNode = nil then
  318. begin
  319. FActiveLeafNode := TLeafNode.Create(FStream);
  320. with TLeafNode(FActiveLeafNode) do
  321. begin
  322. DocRootSize := FHeaderRec.DocIndexRootSize;
  323. CodeRootSize := FHeaderRec.CodeCountRootSize;
  324. LocRootSize := FHeaderRec.LocationCodeRootSize;
  325. end;
  326. end;
  327. if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
  328. begin
  329. FActiveLeafNode.Flush(True);
  330. end;
  331. TLeafNode(FActiveLeafNode).AddWord(AWord);
  332. end;
  333. procedure TChmSearchWriter.WriteToStream;
  334. begin
  335. WriteHeader(True);
  336. ProcessWords;
  337. WriteHeader(False);
  338. end;
  339. constructor TChmSearchWriter.Create ( AStream: TStream;
  340. AWordList: TIndexedWordList ) ;
  341. begin
  342. FStream := AStream;
  343. FWordList := AWordList;
  344. FActiveLeafNode:=NIL;
  345. end;
  346. destructor TChmSearchWriter.Destroy;
  347. begin
  348. freeandnil(FActiveLeafNode);
  349. end;
  350. { TLeafNode }
  351. function TFIftiNode.RemainingSpace: DWord;
  352. begin
  353. Result := FIFTI_NODE_SIZE - FBlockStream.Position;
  354. end;
  355. constructor TFIftiNode.Create ( AStream: TStream ) ;
  356. begin
  357. inherited Create;
  358. FWriteStream := AStream;
  359. FBlockStream := TMemoryStream.Create;
  360. OwnsParentNode :=false;
  361. end;
  362. destructor TFIftiNode.Destroy;
  363. begin
  364. FBlockStream.Free;
  365. if OwnsParentNode then ParentNode.Free;
  366. inherited Destroy;
  367. end;
  368. procedure TFIftiNode.FillRemainingSpace;
  369. begin
  370. while RemainingSpace > 0 do
  371. FBlockStream.WriteByte(0);
  372. end;
  373. function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
  374. var
  375. Count1,
  376. Count2: Integer;
  377. Count: Integer;
  378. i: Integer;
  379. begin
  380. if AWord = AOldWord then
  381. begin
  382. AOffset := Length(AWord);
  383. Exit('');
  384. end;
  385. // else
  386. Count1 := Length(AOldWord);
  387. Count2 := Length(AWord);
  388. if Count1<Count2 then
  389. Count := Count1
  390. else
  391. Count := Count2;
  392. for i := 1 to Count do
  393. begin
  394. AOffset := i-1;
  395. if AOldWord[i] <> AWord[i]
  396. then Exit(Copy(AWord, i, Length(AWord)));
  397. end;
  398. Result := AWord;
  399. AOffset := 0;
  400. end;
  401. procedure TLeafNode.WriteInitialHeader;
  402. begin
  403. FBlockStream.WriteDWord(0);
  404. FBlockStream.WriteWord(0);
  405. FBlockStream.WriteWord(0);
  406. end;
  407. destructor TLeafNode.Destroy;
  408. begin
  409. inherited Destroy;
  410. end;
  411. function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
  412. var
  413. WordOffset: Byte;
  414. begin
  415. Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
  416. end;
  417. procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
  418. var
  419. FTmpPos: DWord;
  420. begin
  421. Inc(FLeafNodeCount);
  422. FTmpPos := FWriteStream.Position;
  423. // update the previous leaf node about our position.
  424. if FLastNodeStart > 0 then
  425. begin
  426. FWriteStream.Position := FLastNodeStart;
  427. FWriteStream.WriteDWord(NtoLE(FTmpPos));
  428. FWriteStream.Position := FTmpPos;
  429. end;
  430. FLastNodeStart := FTmpPos;
  431. FreeSpace := RemainingSpace;
  432. FillRemainingSpace;
  433. // update the leaf header to show the available space.
  434. FBlockStream.Position := 6;
  435. FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));
  436. // copy the leaf block to the fiftimain file
  437. FBlockStream.Position := 0;
  438. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  439. FBlockStream.Position := 0;
  440. if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
  441. begin
  442. if ParentNode = nil then
  443. begin
  444. ParentNode := TIndexNode.Create(FWriteStream);
  445. OwnsParentNode:=True;
  446. end;
  447. ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
  448. if (NewBlockNeeded = False) then
  449. ParentNode.Flush(False);
  450. end;
  451. FLastWord := '';
  452. end;
  453. procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
  454. var
  455. Offset: Byte;
  456. NewWord: String;
  457. WLCSize: DWord;
  458. begin
  459. if Length(AWord.TheWord) > 99 then
  460. Exit; // Maximum word length is 99
  461. if FBlockStream.Position = 0 then
  462. WriteInitialHeader;
  463. NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);
  464. FLastWord := AWord.TheWord;
  465. FBlockStream.WriteByte(Length(NewWord)+1);
  466. FBlockStream.WriteByte(Offset);
  467. // length can be 0 if it is the same word as the last. there is a word entry each for title and content
  468. if Length(NewWord) > 0 then
  469. FBlockStream.Write(NewWord[1], Length(NewWord));
  470. FBlockStream.WriteByte(Ord(AWord.IsTitle));
  471. WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
  472. FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
  473. FBlockStream.WriteWord(0);
  474. // write WLC to FWriteStream so we can write the size of the wlc entries
  475. WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
  476. WriteCompressedIntegerBE(FBlockStream, WLCSize);
  477. if FBlockStream.Position > FIFTI_NODE_SIZE then
  478. raise Exception.Create('FIFTIMAIN Leaf node has written past the block!');
  479. end;
  480. function Min(AValue, BValue: Byte): Byte;
  481. begin
  482. if AValue < BValue then
  483. Result := AValue
  484. else Result := BValue;
  485. end;
  486. function Max(AValue, BValue: Byte): Byte;
  487. begin
  488. if AValue > BValue then
  489. Result := AValue
  490. else Result := BValue;
  491. end;
  492. function Max(AValue, BValue: Integer): Integer;
  493. begin
  494. if AValue > BValue then
  495. Result := AValue
  496. else Result := BValue;
  497. end;
  498. function Max(AValue, BValue: DWord): DWord;
  499. begin
  500. if AValue > BValue then
  501. Result := AValue
  502. else Result := BValue;
  503. end;
  504. function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
  505. var
  506. LastDocIndex: DWord;
  507. LastLocCode: DWord;
  508. UsedBits: Byte;
  509. Buf: Byte;
  510. function NewDocDelta(ADocIndex: DWord): DWord;
  511. begin
  512. Result := ADocIndex - LastDocIndex;
  513. LastDocIndex := ADocIndex;
  514. end;
  515. function NewLocCode(ALocCode: DWord): DWord;
  516. begin
  517. Result := ALocCode - LastLocCode;
  518. LastLocCode := ALocCode;
  519. end;
  520. procedure AddValue(AValue: DWord; BitCount: Byte);
  521. var
  522. NeededBits: Byte;
  523. Tmp: Byte;
  524. begin
  525. AValue := AValue shl (32 - BitCount);
  526. while BitCount > 0 do
  527. begin
  528. NeededBits := 8 - UsedBits;
  529. Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
  530. Buf := Buf or Tmp;
  531. Inc(UsedBits, Min(BitCount, NeededBits));
  532. AValue := AValue shl Min(BitCount, NeededBits);
  533. Dec(BitCount, Min(BitCount, NeededBits));
  534. if (UsedBits = 8) then
  535. begin
  536. FWriteStream.WriteByte(Buf);
  537. UsedBits := 0;
  538. NeededBits := 0;
  539. Buf := 0;
  540. end;
  541. end;
  542. end;
  543. procedure FlushBuffer;
  544. begin
  545. if UsedBits > 0 then
  546. FWriteStream.WriteByte(Buf);
  547. UsedBits := 0;
  548. Buf := 0;
  549. end;
  550. var
  551. DocDelta: DWord;
  552. LocDelta: DWord;
  553. StartPos: DWord;
  554. Bits: DWord;
  555. BitCount: Byte;
  556. i,
  557. j: Integer;
  558. Doc: TIndexDocument;
  559. // proced
  560. begin
  561. StartPos := FWriteStream.Position;
  562. LastDocIndex := 0;
  563. UsedBits := 0;
  564. Buf := 0;
  565. for i := 0 to AWord.DocumentCount-1 do
  566. begin
  567. LastLocCode := 0;
  568. Doc := AWord.GetLogicalDocument(i);
  569. DocDelta := NewDocDelta(Doc.DocumentIndex);
  570. BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
  571. AddValue(Bits, BitCount);
  572. BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
  573. AddValue(Bits, BitCount);
  574. for j := 0 to Doc.NumberOfIndexEntries-1 do
  575. begin
  576. LocDelta := NewLocCode(Doc.IndexEntry[j]);
  577. BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
  578. AddValue(Bits, BitCount);
  579. end;
  580. FlushBuffer;
  581. end;
  582. Result := FWriteStream.Position-StartPos;
  583. end;
  584. { TIndexNode }
  585. function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
  586. var
  587. Offset: Byte;
  588. begin
  589. Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
  590. end;
  591. procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
  592. var
  593. Offset: Byte;
  594. NewWord: String;
  595. begin
  596. if FBlockStream.Position = 0 then
  597. FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
  598. if GuessIfCanHold(AWord) = False then
  599. Flush(True);
  600. NewWord := AdjustedWord(AWord, Offset, FLastWord);
  601. FLastWord:=AWord;
  602. // Write the Index node Entry
  603. FBlockStream.WriteByte(Length(NewWord)+1);
  604. FBlockStream.WriteByte(Offset);
  605. FBlockStream.Write(NewWord[1], Length(NewWord));
  606. FBlockStream.WriteDWord(NtoLE(ANodeOffset));
  607. FBlockStream.WriteWord(0);
  608. if FBlockStream.Position > FIFTI_NODE_SIZE then
  609. raise Exception.Create('FIFTIMAIN Index node has written past the block!');
  610. end;
  611. procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
  612. var
  613. RemSize: DWord;
  614. begin
  615. if NewBlockNeeded then
  616. begin
  617. if ParentNode = nil then
  618. begin
  619. ParentNode := TIndexNode.Create(FWriteStream);
  620. OwnsParentNode:=True;
  621. end;
  622. end;
  623. if ParentNode <> nil then
  624. ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);
  625. RemSize := RemainingSpace;
  626. FillRemainingSpace;
  627. FBlockStream.Position := 0;
  628. FBlockStream.WriteWord(NtoLE(RemSize));
  629. FBlockStream.Position := 0;
  630. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  631. FBlockStream.Position := 0;
  632. FLastWord := '';
  633. if NewBlockNeeded then
  634. FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
  635. else
  636. if ParentNode <> nil then
  637. ParentNode.Flush(NewBlockNeeded);
  638. end;
  639. { TChmSearchReader }
  640. procedure TChmSearchReader.ReadCommonData;
  641. var
  642. Sig: DWord;
  643. begin
  644. FStream.Position := 0;
  645. Sig := LEtoN(FStream.ReadDWord);
  646. FFileIsValid := Sig = $00280000;
  647. if not FileIsValid then
  648. Exit;
  649. // root node address
  650. FStream.Position := $8;
  651. FRootNodeOffset := LEtoN(FStream.ReadDWord);
  652. // Tree Depth
  653. FStream.Position := $18;
  654. FTreeDepth := LEtoN(FStream.ReadWord);
  655. // Root sizes for scale and root integers
  656. FStream.Position := $1E;
  657. if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
  658. FFileIsValid := False;
  659. FDocRootSize := FStream.ReadByte;
  660. if FStream.ReadByte <> 2 then
  661. FFileIsValid := False;
  662. FCodeCountRootSize := FStream.ReadByte;
  663. if FStream.ReadByte <> 2 then
  664. FFileIsValid := False;
  665. FLocCodeRootSize := FStream.ReadByte;
  666. end;
  667. procedure TChmSearchReader.MoveToFirstLeafNode;
  668. var
  669. NodeDepth: Integer;
  670. NodeOffset: DWord;
  671. LastWord: String;
  672. NewWord: String;
  673. begin
  674. NodeDepth := FTreeDepth;
  675. MoveToRootNode;
  676. while NodeDepth > 1 do
  677. begin
  678. LastWord := '';
  679. ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
  680. Dec(NodeDepth);
  681. MoveToNode(NodeOffset, NodeDepth);
  682. end;
  683. end;
  684. procedure TChmSearchReader.MoveToRootNode;
  685. begin
  686. MoveToNode(FRootNodeOffset, FTreeDepth);
  687. end;
  688. procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  689. begin
  690. FStream.Position := ANodeOffset;
  691. FActiveNodeStart := FStream.Position;
  692. if ANodeDepth > 1 then
  693. begin
  694. FnextLeafNode := 0;
  695. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
  696. end
  697. else
  698. begin
  699. FnextLeafNode := LEtoN(FStream.ReadDWord);
  700. FStream.ReadWord;
  701. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
  702. end;
  703. end;
  704. function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
  705. var
  706. WordLength: Integer;
  707. CopyLastWordCharCount: Integer;
  708. begin
  709. WordLength := FStream.ReadByte;
  710. CopyLastWordCharCount := FStream.ReadByte;
  711. if CopyLastWordCharCount > 0 then
  712. Result := Copy(ALastWord, 1, CopyLastWordCharCount);
  713. SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
  714. FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
  715. end;
  716. function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
  717. ASubNodeStart: DWord ): Boolean;
  718. begin
  719. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  720. if not Result then
  721. Exit;
  722. AWord := ReadWordOrPartialWord(ALastWord);
  723. ASubNodeStart := LEtoN(FStream.ReadDWord);
  724. FStream.ReadWord;
  725. end;
  726. function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
  727. AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
  728. AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
  729. begin
  730. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  731. if not Result then
  732. Exit;
  733. AWord := ReadWordOrPartialWord(ALastWord);
  734. AInTitle := FStream.ReadByte = 1;
  735. AWLCCount := GetCompressedIntegerBE(FStream);
  736. AWLCOffset := LEtoN(FStream.ReadDWord);
  737. FStream.ReadWord;
  738. AWLCSize := GetCompressedIntegerBE(FStream);
  739. end;
  740. function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;
  741. function AtEndOfWLCEntries: Boolean;
  742. begin
  743. Result := AWLCOffset + AWLCSize <= FStream.Position;
  744. end;
  745. var
  746. Buf: Byte;
  747. BitsInBuffer: Integer;
  748. procedure FillBuffer;
  749. begin
  750. while (BitsInBuffer = 0) and not AtEndOfWLCEntries do
  751. begin
  752. Buf := FStream.ReadByte;
  753. Inc(BitsInBuffer, 8);
  754. end;
  755. end;
  756. function ReadWLC(RootSize: DWord): DWord;
  757. var
  758. PrefixBits: Integer = 0;
  759. BitCount: Integer = 0;
  760. RemainingBits: Integer; // only the bits for this number not the bits in buffer
  761. begin
  762. FillBuffer;
  763. Result := 0;
  764. while (Buf and $80) > 0 do // find out how many prefix bits there are
  765. begin
  766. Inc(PrefixBits);
  767. Buf := Buf shl 1;
  768. Dec(BitsInBuffer);
  769. FillBuffer;
  770. end;
  771. if PrefixBits > 0 then
  772. Result := 1;
  773. Inc(BitCount, PrefixBits+1);
  774. Buf := Buf shl 1;
  775. Dec(BitsInBuffer);
  776. FillBuffer;
  777. Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
  778. while RemainingBits > 0 do
  779. begin
  780. Result := Result shl 1;
  781. Result := Result or (Buf shr 7);
  782. Dec(RemainingBits);
  783. Buf := Buf shl 1;
  784. Dec(BitsInBuffer);
  785. FillBuffer;
  786. Inc(BitCount);
  787. end;
  788. end;
  789. procedure ClearBuffer;
  790. begin
  791. BitsInBuffer := 0;
  792. Buf := 0;
  793. end;
  794. var
  795. TopicHits: DWord;
  796. i: Integer;
  797. j: Integer;
  798. CachedStreamPos: QWord;
  799. LastDoc,
  800. LastLocCode: DWord;
  801. begin
  802. CachedStreamPos := FStream.Position;
  803. FStream.Position := AWLCOffset;
  804. {for i := 0 to AWLCSize-1 do
  805. begin
  806. Buf := FStream.ReadByte;
  807. Write(binStr(Buf, 8), ' ');
  808. end;}
  809. FStream.Position := AWLCOffset;
  810. SetLength(Result, AWLCCount);
  811. Buf := 0;
  812. BitsInBuffer := 0;
  813. LastDoc := 0;
  814. for i := 0 to AWLCCount-1 do
  815. begin
  816. Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;
  817. LastDoc := Result[i].TopicIndex;
  818. TopicHits := ReadWLC(FCodeCountRootSize);
  819. SetLength(Result[i].LocationCodes, TopicHits);
  820. LastLocCode := 0;
  821. for j := 0 to TopicHits-1 do
  822. begin
  823. Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
  824. LastLocCode := Result[i].LocationCodes[j];
  825. end;
  826. ClearBuffer;
  827. end;
  828. FStream.Position := CachedStreamPos;
  829. end;
  830. constructor TChmSearchReader.Create ( AStream: TStream;
  831. AFreeStreamOnDestroy: Boolean ) ;
  832. begin
  833. FStream := AStream;
  834. FFreeStreamOnDestroy := AFreeStreamOnDestroy;
  835. ReadCommonData;
  836. end;
  837. destructor TChmSearchReader.Destroy;
  838. begin
  839. if FFreeStreamOnDestroy then
  840. FreeAndNil(FStream);
  841. inherited Destroy;
  842. end;
  843. procedure TChmSearchReader.DumpData (
  844. AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
  845. var
  846. LastWord: String;
  847. TheWord: String;
  848. InTitle: Boolean;
  849. WLCCount: DWord;
  850. WLCOffset: DWord;
  851. WLCSize: DWord;
  852. FoundHits: TChmWLCTopicArray;
  853. i: Integer;
  854. j: Integer;
  855. begin
  856. MoveToFirstLeafNode;
  857. LastWord := '';
  858. repeat
  859. if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
  860. begin
  861. if FnextLeafNode <> 0 then
  862. begin
  863. MoveToNode(FnextLeafNode, 1);
  864. LastWord := '';
  865. end
  866. else
  867. Break;
  868. end
  869. else begin
  870. LastWord := TheWord;
  871. //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
  872. FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  873. //WriteLn('DONE Reading Hits for ', TheWord);
  874. // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
  875. for i := 0 to High(FoundHits) do
  876. for j := 0 to High(FoundHits[i].LocationCodes) do
  877. AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
  878. end;
  879. until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
  880. end;
  881. function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  882. var
  883. LastWord: String;
  884. NewWord: String;
  885. NodeLevel: Integer;
  886. NewNodePosition: DWord;
  887. InTitle: Boolean;
  888. WLCCount: DWord;
  889. WLCOffset: DWord;
  890. WLCSize: DWord;
  891. CompareResult: Integer;
  892. ReadNextResult: Boolean;
  893. begin
  894. AWord := LowerCase(AWord);
  895. NodeLevel := FTreeDepth;
  896. MoveToRootNode;
  897. SetLength(Result, 0);
  898. LastWord := '';
  899. // descend the index node tree until we find the leafnode
  900. while NodeLevel > 1 do begin
  901. //WriteLn('At Node Level ', NodeLevel);
  902. if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
  903. begin
  904. LastWord := NewWord;
  905. //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
  906. if ChmCompareText(NewWord, AWord) >= 0 then
  907. begin
  908. LastWord := '';
  909. Dec(NodeLevel);
  910. MoveToNode(NewNodePosition, NodeLevel);
  911. end;
  912. end
  913. else
  914. Break;
  915. end;
  916. if NodeLevel > 1 then
  917. Exit; // the entry we are looking for is > than the last entry of the last index node
  918. // now we are in a leafnode
  919. while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
  920. begin
  921. //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
  922. LastWord := NewWord;
  923. if Length(NewWord) < Length(AWord) then
  924. continue;
  925. if AStartsWith then //it only has to start with the searched term
  926. CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord)))
  927. else // it must match exactly
  928. CompareResult := ChmCompareText(AWord, NewWord);
  929. if CompareResult < 0 then
  930. Exit;
  931. if CompareResult = 0 then
  932. begin
  933. if InTitle then
  934. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  935. else
  936. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  937. // check if the next entry is the same word since there is an entry for titles and for body
  938. if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
  939. ReadNextResult := True
  940. else if (FNextLeafNode <> 0) then
  941. begin
  942. MoveToNode(FNextLeafNode, 1);
  943. LastWord := '';
  944. ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
  945. end;
  946. if ReadNextResult and (NewWord = AWord) then
  947. begin
  948. if InTitle then
  949. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  950. else
  951. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  952. end;
  953. Exit;
  954. end;
  955. end;
  956. end;
  957. end.