chmfiftimain.pas 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098
  1. { Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., i51 Franklin Street, Fifth Floor, Boston, MA 02111-1301, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. unit chmFiftiMain;
  19. {$mode objfpc}{$H+}
  20. interface
  21. uses Classes, HTMLIndexer;
  22. type
  23. TFiftiMainHeader = record
  24. Sig: array [0..3] of byte; //$00,$00,$28,$00
  25. HTMLFilesCount: DWord;
  26. RootNodeOffset: DWord;
  27. Unknown1: DWord; // = 0
  28. LeafNodeCount: DWord;
  29. CopyOfRootNodeOffset: DWord;
  30. TreeDepth: Word;
  31. Unknown2: DWord; // = 7
  32. DocIndexScale: Byte;
  33. DocIndexRootSize: Byte;
  34. CodeCountScale: Byte;
  35. CodeCountRootSize: Byte;
  36. LocationCodeScale: Byte;
  37. LocationCodeRootSize: Byte;
  38. Unknown3: array[0..9] of byte; // = 0
  39. NodeSize: DWord; // 4096;
  40. Unknown4: DWord; // 0 or 1;
  41. LastDupWordIndex: DWord;
  42. LastDupCharIndex: DWord;
  43. LongestWordLength: DWord; // maximum 99
  44. TotalWordsIndexed: DWord; // includes duplicates
  45. TotalWords: DWord; // word count not including duplicates
  46. TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
  47. TotalWordsLengthPart2: DWord;
  48. TotalWordsLength: DWord; // length of all words not including duplicates
  49. WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
  50. Unknown5: DWord; // 0
  51. HTMLFilesCountMinusOne: DWord; // maybe
  52. Unknown6: array[0..23] of Byte; // 0
  53. WindowsCodePage: DWord; // usually 1252
  54. LocalID: DWord;
  55. //Unknown7: array [0..893] of Byte; // 0
  56. end;
  57. { TFIftiNode }
  58. TFIftiNode = class(TObject)
  59. FLastWord: String;
  60. FWriteStream: TStream;
  61. FBlockStream: TMemoryStream;
  62. ParentNode: TFIftiNode;
  63. OwnsParentNode : boolean;
  64. function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
  65. procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
  66. function GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
  67. procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
  68. procedure FillRemainingSpace;
  69. function RemainingSpace: DWord;
  70. constructor Create(AStream: TStream);
  71. destructor Destroy; override;
  72. end;
  73. { TChmSearchWriter }
  74. TChmSearchWriter = class(TObject)
  75. private
  76. FHeaderRec: TFiftiMainHeader;
  77. FStream: TStream;
  78. FWordList: TIndexedWordList;
  79. FActiveLeafNode: TFIftiNode;
  80. function GetHasData: Boolean;
  81. procedure ProcessWords;
  82. procedure WriteHeader(IsPlaceHolder: Boolean);
  83. procedure WriteAWord(AWord: TIndexedWord);
  84. public
  85. procedure WriteToStream;
  86. property HasData: Boolean read GetHasData;
  87. constructor Create(AStream: TStream; AWordList: TIndexedWordList);
  88. destructor Destroy; override;
  89. end;
  90. { TChmSearchReader }
  91. TChmWLCTopic = record
  92. TopicIndex: DWord;
  93. LocationCodes: array of DWord;
  94. end;
  95. TChmWLCTopicArray = array of TChmWLCTopic;
  96. TChmSearchReader = class;
  97. TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;
  98. TChmSearchReader = class(TObject)
  99. private
  100. FStream: TStream;
  101. FFileIsValid: Boolean;
  102. FFreeStreamOnDestroy: Boolean;
  103. FDocRootSize,
  104. FCodeCountRootSize,
  105. FLocCodeRootSize: Integer;
  106. FTreeDepth: Integer;
  107. FRootNodeOffset: DWord;
  108. FActiveNodeStart: DWord;
  109. FActiveNodeFreeSpace: Word;
  110. FNextLeafNode: DWord;
  111. procedure ReadCommonData;
  112. procedure MoveToFirstLeafNode;
  113. procedure MoveToRootNode;
  114. procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  115. function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
  116. function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
  117. function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
  118. function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
  119. public
  120. constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
  121. destructor Destroy; override;
  122. procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
  123. function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  124. property FileIsValid: Boolean read FFileIsValid;
  125. end;
  126. const
  127. FIFTI_NODE_SIZE = 4096;
  128. implementation
  129. uses SysUtils, Math, ChmBase;
  130. type
  131. { TIndexNode }
  132. TIndexNode = class(TFIftiNode)
  133. function GuessIfCanHold(AWord: String): Boolean; override;
  134. procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
  135. procedure Flush(NewBlockNeeded: Boolean); override;
  136. end;
  137. { TLeafNode }
  138. TLeafNode = class(TFIftiNode)
  139. FLeafNodeCount: DWord;
  140. FLastNodeStart: DWord;
  141. FreeSpace: DWord;
  142. FDocRootSize,
  143. FCodeRootSize,
  144. FLocRootSize: Byte;
  145. procedure WriteInitialHeader;
  146. Destructor Destroy; override;
  147. function GuessIfCanHold(AWord: String): Boolean; override;
  148. procedure Flush(NewBlockNeeded: Boolean); override;
  149. procedure AddWord(AWord: TIndexedWord);
  150. function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
  151. property LeafNodeCount: DWord read FLeafNodeCount;
  152. property DocRootSize: Byte read FDocRootSize write FDocRootSize;
  153. property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
  154. property LocRootSize: Byte read FLocRootSize write FLocRootSize;
  155. end;
  156. function GetCompressedIntegerBE(Stream: TStream): DWord;
  157. var
  158. Buf: Byte;
  159. Value: Dword = 0;
  160. Shift: Integer = 0;
  161. begin
  162. repeat
  163. Buf := Stream.ReadByte;
  164. Value := Value or (Buf and $7F) shl Shift;
  165. Inc(Shift, 7);
  166. until (Buf and $80) = 0;
  167. Result := Value;
  168. end;
  169. procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
  170. var
  171. Bits: Integer;
  172. Tmp: DWord;
  173. Buf: Byte;
  174. begin
  175. Tmp := AInt;
  176. Bits := 0;
  177. while Tmp <> 0 do
  178. begin
  179. Tmp := Tmp shr 1;
  180. Inc(Bits);
  181. end;
  182. repeat
  183. Buf := (AInt shr (Tmp * 7)) and $7F;
  184. if Bits > 7 then
  185. Buf := Buf or $80;
  186. Dec(Bits, 7);
  187. Inc(Tmp);
  188. Stream.WriteByte(Buf);
  189. until Bits <= 0;
  190. end;
  191. function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
  192. var
  193. Tmp: DWord;
  194. Mask: DWord;
  195. // Scale: Integer;
  196. NeededBits: Integer;
  197. PrefixBits: Integer;
  198. RootBits: Integer;
  199. begin
  200. // Scale := 2;
  201. Bits := 0;
  202. Result := Root;
  203. Tmp := ANumber;
  204. NeededBits := 0;
  205. while Tmp <> 0 do
  206. begin
  207. Inc(NeededBits);
  208. Tmp := Tmp shr 1;
  209. end;
  210. PrefixBits := Max(0, NeededBits-Root);
  211. RootBits := NeededBits -1; //
  212. if RootBits < Root then
  213. RootBits := Root;
  214. if RootBits < 0 then
  215. RootBits := 0;
  216. Mask := 0;
  217. if RootBits-1 >= 0 then
  218. for Tmp := 0 to RootBits-1 do
  219. Mask := Mask or (DWord(1) shl Tmp);
  220. Bits := not Mask;
  221. Bits := Bits shl 1; // make space for empty bit
  222. Bits := Bits or (ANumber and Mask);
  223. Result := PrefixBits + 1 + RootBits;
  224. Bits := (Bits shl (32-Result)) shr (32 - Result);
  225. end;
  226. { TChmSearchWriter }
  227. procedure TChmSearchWriter.ProcessWords;
  228. begin
  229. FWordList.ForEach(@WriteAword);
  230. if FActiveLeafNode <> nil then
  231. FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
  232. end;
  233. function TChmSearchWriter.GetHasData: Boolean;
  234. begin
  235. Result := FWordList.IndexedFileCount > 0;
  236. end;
  237. procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
  238. var
  239. TmpNode: TFIftiNode;
  240. i: Integer;
  241. begin
  242. if IsPlaceHolder then
  243. begin
  244. FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
  245. FStream.Position := $400;
  246. FillChar(PChar(TMemoryStream(FStream).Memory)^, $400, 0);
  247. FHeaderRec.DocIndexRootSize := 1;
  248. FHeaderRec.CodeCountRootSize := 1;
  249. FHeaderRec.LocationCodeRootSize := 4;
  250. Exit;
  251. end;
  252. // write the glorious header
  253. FHeaderRec.Sig[2] := $28;
  254. FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
  255. FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE;
  256. FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
  257. FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
  258. FHeaderRec.TreeDepth := 0;
  259. TmpNode := FActiveLeafNode;
  260. while TmpNode <> nil do
  261. begin
  262. Inc(FHeaderRec.TreeDepth);
  263. TmpNode := TmpNode.ParentNode;
  264. end;
  265. FHeaderRec.DocIndexScale := 2;
  266. FHeaderRec.CodeCountScale := 2;
  267. FHeaderRec.LocationCodeScale := 2;
  268. //FHeaderRec.DocIndexRootSize := 15;
  269. //FHeaderRec.CodeCountRootSize := 15;
  270. //FHeaderRec.LocationCodeRootSize := 15;
  271. FHeaderRec.NodeSize := FIFTI_NODE_SIZE;
  272. FHeaderRec.LongestWordLength := FWordList.LongestWord;
  273. FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
  274. FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
  275. FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
  276. FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
  277. FHeaderRec.WindowsCodePage := 1252;
  278. FStream.Position := 0;
  279. FStream.Write(FHeaderRec.Sig[0], 4);
  280. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
  281. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
  282. FStream.WriteDWord(NtoLE(0)); // unknown 1
  283. FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
  284. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
  285. FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
  286. FStream.WriteDWord(NtoLE(DWord(7)));
  287. FStream.WriteByte(2);
  288. FStream.WriteByte(FHeaderRec.DocIndexRootSize);
  289. FStream.WriteByte(2);
  290. FStream.WriteByte(FHeaderRec.CodeCountRootSize);
  291. FStream.WriteByte(2);
  292. FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
  293. // eat 10 bytes
  294. FStream.WriteWord(0);
  295. FStream.WriteDWord(0);
  296. FStream.WriteDWord(0);
  297. FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
  298. FStream.WriteDWord(NtoLE(DWord(0)));
  299. FStream.WriteDWord(1);
  300. FStream.WriteDWord(5);
  301. FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
  302. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
  303. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
  304. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
  305. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
  306. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
  307. FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
  308. FStream.WriteDWord(NtoLE(0));
  309. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
  310. for i := 0 to 23 do FStream.WriteByte(0);
  311. FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
  312. FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
  313. for i := 0 to 893 do FStream.WriteByte(0);
  314. end;
  315. procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;
  316. begin
  317. if FActiveLeafNode = nil then
  318. begin
  319. FActiveLeafNode := TLeafNode.Create(FStream);
  320. with TLeafNode(FActiveLeafNode) do
  321. begin
  322. DocRootSize := FHeaderRec.DocIndexRootSize;
  323. CodeRootSize := FHeaderRec.CodeCountRootSize;
  324. LocRootSize := FHeaderRec.LocationCodeRootSize;
  325. end;
  326. end;
  327. if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
  328. begin
  329. FActiveLeafNode.Flush(True);
  330. end;
  331. TLeafNode(FActiveLeafNode).AddWord(AWord);
  332. end;
  333. procedure TChmSearchWriter.WriteToStream;
  334. begin
  335. WriteHeader(True);
  336. ProcessWords;
  337. WriteHeader(False);
  338. end;
  339. constructor TChmSearchWriter.Create ( AStream: TStream;
  340. AWordList: TIndexedWordList ) ;
  341. begin
  342. FStream := AStream;
  343. FWordList := AWordList;
  344. FActiveLeafNode:=NIL;
  345. end;
  346. destructor TChmSearchWriter.Destroy;
  347. begin
  348. freeandnil(FActiveLeafNode);
  349. inherited;
  350. end;
  351. { TLeafNode }
  352. function TFIftiNode.RemainingSpace: DWord;
  353. begin
  354. Result := FIFTI_NODE_SIZE - FBlockStream.Position;
  355. end;
  356. constructor TFIftiNode.Create ( AStream: TStream ) ;
  357. begin
  358. inherited Create;
  359. FWriteStream := AStream;
  360. FBlockStream := TMemoryStream.Create;
  361. OwnsParentNode :=false;
  362. end;
  363. destructor TFIftiNode.Destroy;
  364. begin
  365. FBlockStream.Free;
  366. if OwnsParentNode then ParentNode.Free;
  367. inherited Destroy;
  368. end;
  369. procedure TFIftiNode.FillRemainingSpace;
  370. begin
  371. while RemainingSpace > 0 do
  372. FBlockStream.WriteByte(0);
  373. end;
  374. function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
  375. var
  376. Count1,
  377. Count2: Integer;
  378. Count: Integer;
  379. i: Integer;
  380. begin
  381. if AWord = AOldWord then
  382. begin
  383. AOffset := Length(AWord);
  384. Exit('');
  385. end;
  386. // else
  387. Count1 := Length(AOldWord);
  388. Count2 := Length(AWord);
  389. if Count1<Count2 then
  390. Count := Count1
  391. else
  392. Count := Count2;
  393. for i := 1 to Count do
  394. begin
  395. AOffset := i-1;
  396. if AOldWord[i] <> AWord[i]
  397. then Exit(Copy(AWord, i, Length(AWord)));
  398. end;
  399. Result := AWord;
  400. AOffset := 0;
  401. end;
  402. procedure TLeafNode.WriteInitialHeader;
  403. begin
  404. FBlockStream.WriteDWord(0);
  405. FBlockStream.WriteWord(0);
  406. FBlockStream.WriteWord(0);
  407. end;
  408. destructor TLeafNode.Destroy;
  409. begin
  410. inherited Destroy;
  411. end;
  412. function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
  413. var
  414. WordOffset: Byte;
  415. begin
  416. Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
  417. end;
  418. procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
  419. var
  420. FTmpPos: DWord;
  421. begin
  422. Inc(FLeafNodeCount);
  423. FTmpPos := FWriteStream.Position;
  424. // update the previous leaf node about our position.
  425. if FLastNodeStart > 0 then
  426. begin
  427. FWriteStream.Position := FLastNodeStart;
  428. FWriteStream.WriteDWord(NtoLE(FTmpPos));
  429. FWriteStream.Position := FTmpPos;
  430. end;
  431. FLastNodeStart := FTmpPos;
  432. FreeSpace := RemainingSpace;
  433. FillRemainingSpace;
  434. // update the leaf header to show the available space.
  435. FBlockStream.Position := 6;
  436. FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));
  437. // copy the leaf block to the fiftimain file
  438. FBlockStream.Position := 0;
  439. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  440. FBlockStream.Position := 0;
  441. if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
  442. begin
  443. if ParentNode = nil then
  444. begin
  445. ParentNode := TIndexNode.Create(FWriteStream);
  446. OwnsParentNode:=True;
  447. end;
  448. ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
  449. if (NewBlockNeeded = False) then
  450. ParentNode.Flush(False);
  451. end;
  452. FLastWord := '';
  453. end;
  454. procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
  455. var
  456. Offset: Byte;
  457. NewWord: String;
  458. WLCSize: DWord;
  459. begin
  460. if Length(AWord.TheWord) > 99 then
  461. Exit; // Maximum word length is 99
  462. if FBlockStream.Position = 0 then
  463. WriteInitialHeader;
  464. NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);
  465. FLastWord := AWord.TheWord;
  466. FBlockStream.WriteByte(Length(NewWord)+1);
  467. FBlockStream.WriteByte(Offset);
  468. // length can be 0 if it is the same word as the last. there is a word entry each for title and content
  469. if Length(NewWord) > 0 then
  470. FBlockStream.Write(NewWord[1], Length(NewWord));
  471. FBlockStream.WriteByte(Ord(AWord.IsTitle));
  472. WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
  473. FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
  474. FBlockStream.WriteWord(0);
  475. // write WLC to FWriteStream so we can write the size of the wlc entries
  476. WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
  477. WriteCompressedIntegerBE(FBlockStream, WLCSize);
  478. if FBlockStream.Position > FIFTI_NODE_SIZE then
  479. raise Exception.Create('FIFTIMAIN Leaf node has written past the block!');
  480. end;
  481. function Min(AValue, BValue: Byte): Byte;
  482. begin
  483. if AValue < BValue then
  484. Result := AValue
  485. else Result := BValue;
  486. end;
  487. function Max(AValue, BValue: Byte): Byte;
  488. begin
  489. if AValue > BValue then
  490. Result := AValue
  491. else Result := BValue;
  492. end;
  493. function Max(AValue, BValue: Integer): Integer;
  494. begin
  495. if AValue > BValue then
  496. Result := AValue
  497. else Result := BValue;
  498. end;
  499. function Max(AValue, BValue: DWord): DWord;
  500. begin
  501. if AValue > BValue then
  502. Result := AValue
  503. else Result := BValue;
  504. end;
  505. function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
  506. var
  507. LastDocIndex: DWord;
  508. LastLocCode: DWord;
  509. UsedBits: Byte;
  510. Buf: Byte;
  511. function NewDocDelta(ADocIndex: DWord): DWord;
  512. begin
  513. Result := ADocIndex - LastDocIndex;
  514. LastDocIndex := ADocIndex;
  515. end;
  516. function NewLocCode(ALocCode: DWord): DWord;
  517. begin
  518. Result := ALocCode - LastLocCode;
  519. LastLocCode := ALocCode;
  520. end;
  521. procedure AddValue(AValue: DWord; BitCount: Byte);
  522. var
  523. NeededBits: Byte;
  524. Tmp: Byte;
  525. begin
  526. AValue := AValue shl (32 - BitCount);
  527. while BitCount > 0 do
  528. begin
  529. NeededBits := 8 - UsedBits;
  530. Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
  531. Buf := Buf or Tmp;
  532. Inc(UsedBits, Min(BitCount, NeededBits));
  533. AValue := AValue shl Min(BitCount, NeededBits);
  534. Dec(BitCount, Min(BitCount, NeededBits));
  535. if (UsedBits = 8) then
  536. begin
  537. FWriteStream.WriteByte(Buf);
  538. UsedBits := 0;
  539. NeededBits := 0;
  540. Buf := 0;
  541. end;
  542. end;
  543. end;
  544. procedure FlushBuffer;
  545. begin
  546. if UsedBits > 0 then
  547. FWriteStream.WriteByte(Buf);
  548. UsedBits := 0;
  549. Buf := 0;
  550. end;
  551. var
  552. DocDelta: DWord;
  553. LocDelta: DWord;
  554. StartPos: DWord;
  555. Bits: DWord;
  556. BitCount: Byte;
  557. i,
  558. j: Integer;
  559. Doc: TIndexDocument;
  560. // proced
  561. begin
  562. StartPos := FWriteStream.Position;
  563. LastDocIndex := 0;
  564. UsedBits := 0;
  565. Buf := 0;
  566. for i := 0 to AWord.DocumentCount-1 do
  567. begin
  568. LastLocCode := 0;
  569. Doc := AWord.GetLogicalDocument(i);
  570. DocDelta := NewDocDelta(Doc.DocumentIndex);
  571. BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
  572. AddValue(Bits, BitCount);
  573. BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
  574. AddValue(Bits, BitCount);
  575. for j := 0 to Doc.NumberOfIndexEntries-1 do
  576. begin
  577. LocDelta := NewLocCode(Doc.IndexEntry[j]);
  578. BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
  579. AddValue(Bits, BitCount);
  580. end;
  581. FlushBuffer;
  582. end;
  583. Result := FWriteStream.Position-StartPos;
  584. end;
  585. { TIndexNode }
  586. function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
  587. var
  588. Offset: Byte;
  589. begin
  590. Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
  591. end;
  592. procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
  593. var
  594. Offset: Byte;
  595. NewWord: String;
  596. begin
  597. if FBlockStream.Position = 0 then
  598. FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
  599. if GuessIfCanHold(AWord) = False then
  600. Flush(True);
  601. NewWord := AdjustedWord(AWord, Offset, FLastWord);
  602. FLastWord:=AWord;
  603. // Write the Index node Entry
  604. FBlockStream.WriteByte(Length(NewWord)+1);
  605. FBlockStream.WriteByte(Offset);
  606. FBlockStream.Write(NewWord[1], Length(NewWord));
  607. FBlockStream.WriteDWord(NtoLE(ANodeOffset));
  608. FBlockStream.WriteWord(0);
  609. if FBlockStream.Position > FIFTI_NODE_SIZE then
  610. raise Exception.Create('FIFTIMAIN Index node has written past the block!');
  611. end;
  612. procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
  613. var
  614. RemSize: DWord;
  615. begin
  616. if NewBlockNeeded then
  617. begin
  618. if ParentNode = nil then
  619. begin
  620. ParentNode := TIndexNode.Create(FWriteStream);
  621. OwnsParentNode:=True;
  622. end;
  623. end;
  624. if ParentNode <> nil then
  625. ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);
  626. RemSize := RemainingSpace;
  627. FillRemainingSpace;
  628. FBlockStream.Position := 0;
  629. FBlockStream.WriteWord(NtoLE(RemSize));
  630. FBlockStream.Position := 0;
  631. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  632. FBlockStream.Position := 0;
  633. FLastWord := '';
  634. if NewBlockNeeded then
  635. FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
  636. else
  637. if ParentNode <> nil then
  638. ParentNode.Flush(NewBlockNeeded);
  639. end;
  640. { TChmSearchReader }
  641. procedure TChmSearchReader.ReadCommonData;
  642. var
  643. Sig: DWord;
  644. begin
  645. FStream.Position := 0;
  646. Sig := LEtoN(FStream.ReadDWord);
  647. FFileIsValid := Sig = $00280000;
  648. if not FileIsValid then
  649. Exit;
  650. // root node address
  651. FStream.Position := $8;
  652. FRootNodeOffset := LEtoN(FStream.ReadDWord);
  653. // Tree Depth
  654. FStream.Position := $18;
  655. FTreeDepth := LEtoN(FStream.ReadWord);
  656. // Root sizes for scale and root integers
  657. FStream.Position := $1E;
  658. if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
  659. FFileIsValid := False;
  660. FDocRootSize := FStream.ReadByte;
  661. if FStream.ReadByte <> 2 then
  662. FFileIsValid := False;
  663. FCodeCountRootSize := FStream.ReadByte;
  664. if FStream.ReadByte <> 2 then
  665. FFileIsValid := False;
  666. FLocCodeRootSize := FStream.ReadByte;
  667. end;
  668. procedure TChmSearchReader.MoveToFirstLeafNode;
  669. var
  670. NodeDepth: Integer;
  671. NodeOffset: DWord;
  672. LastWord: String;
  673. NewWord: String;
  674. begin
  675. NodeDepth := FTreeDepth;
  676. MoveToRootNode;
  677. while NodeDepth > 1 do
  678. begin
  679. LastWord := '';
  680. ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
  681. Dec(NodeDepth);
  682. MoveToNode(NodeOffset, NodeDepth);
  683. end;
  684. end;
  685. procedure TChmSearchReader.MoveToRootNode;
  686. begin
  687. MoveToNode(FRootNodeOffset, FTreeDepth);
  688. end;
  689. procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  690. begin
  691. FStream.Position := ANodeOffset;
  692. FActiveNodeStart := FStream.Position;
  693. if ANodeDepth > 1 then
  694. begin
  695. FnextLeafNode := 0;
  696. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
  697. end
  698. else
  699. begin
  700. FnextLeafNode := LEtoN(FStream.ReadDWord);
  701. FStream.ReadWord;
  702. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
  703. end;
  704. end;
  705. function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
  706. var
  707. WordLength: Integer;
  708. CopyLastWordCharCount: Integer;
  709. begin
  710. WordLength := FStream.ReadByte;
  711. CopyLastWordCharCount := FStream.ReadByte;
  712. if CopyLastWordCharCount > 0 then
  713. Result := Copy(ALastWord, 1, CopyLastWordCharCount);
  714. SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
  715. if WordLength > 1 then
  716. FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
  717. end;
  718. function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
  719. ASubNodeStart: DWord ): Boolean;
  720. begin
  721. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  722. if not Result then
  723. Exit;
  724. AWord := ReadWordOrPartialWord(ALastWord);
  725. ASubNodeStart := LEtoN(FStream.ReadDWord);
  726. FStream.ReadWord;
  727. end;
  728. function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
  729. AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
  730. AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
  731. begin
  732. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  733. if not Result then
  734. Exit;
  735. AWord := ReadWordOrPartialWord(ALastWord);
  736. AInTitle := FStream.ReadByte = 1;
  737. AWLCCount := GetCompressedIntegerBE(FStream);
  738. AWLCOffset := LEtoN(FStream.ReadDWord);
  739. FStream.ReadWord;
  740. AWLCSize := GetCompressedIntegerBE(FStream);
  741. end;
  742. function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;
  743. var
  744. Buf: Byte;
  745. BitsInBuffer: Integer;
  746. FinalPosition: int64;
  747. function GetNextByte(): Boolean;
  748. begin
  749. Result := (FStream.Position < FinalPosition);
  750. if Result then
  751. begin
  752. Buf := FStream.ReadByte;
  753. Inc(BitsInBuffer, 8);
  754. end;
  755. end;
  756. function ShiftBuffer: Boolean;
  757. begin
  758. Buf := (Buf and $7F) shl 1;
  759. Dec(BitsInBuffer);
  760. Result := (BitsInBuffer > 0) or GetNextByte();
  761. end;
  762. function ReadWLC(RootSize: DWord): DWord;
  763. var
  764. PrefixBits: Integer = 0;
  765. RemainingBits: Integer; // only the bits for this number not the bits in buffer
  766. begin
  767. if (BitsInBuffer = 0) then
  768. GetNextByte();
  769. Result := (Buf and $80) shr 7;
  770. while (Buf and $80) > 0 do // find out how many prefix bits there are
  771. begin
  772. Inc(PrefixBits);
  773. if not ShiftBuffer then
  774. Exit;
  775. end;
  776. // skip divider (zero) bit
  777. if not ShiftBuffer then
  778. Exit;
  779. Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
  780. while RemainingBits > 0 do
  781. begin
  782. Result := Result shl 1;
  783. Result := Result or (Buf shr 7);
  784. Dec(RemainingBits);
  785. if not ShiftBuffer then
  786. Exit;
  787. end;
  788. end;
  789. procedure ClearBuffer;
  790. begin
  791. if BitsInBuffer < 8 then
  792. begin
  793. BitsInBuffer := 0;
  794. Buf := 0;
  795. end;
  796. end;
  797. var
  798. TopicHits: DWord;
  799. i: Integer;
  800. j: Integer;
  801. CachedStreamPos: QWord;
  802. LastDoc,
  803. LastLocCode: DWord;
  804. begin
  805. FinalPosition := AWLCOffset + AWLCSize;
  806. CachedStreamPos := FStream.Position;
  807. FStream.Position := AWLCOffset;
  808. {for i := 0 to AWLCSize-1 do
  809. begin
  810. Buf := FStream.ReadByte;
  811. Write(binStr(Buf, 8), ' ');
  812. end;}
  813. FStream.Position := AWLCOffset;
  814. SetLength(Result, AWLCCount);
  815. Buf := 0;
  816. BitsInBuffer := 0;
  817. LastDoc := 0;
  818. for i := 0 to AWLCCount-1 do
  819. begin
  820. Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;
  821. LastDoc := Result[i].TopicIndex;
  822. TopicHits := ReadWLC(FCodeCountRootSize);
  823. SetLength(Result[i].LocationCodes, TopicHits);
  824. LastLocCode := 0;
  825. for j := 0 to TopicHits-1 do
  826. begin
  827. Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
  828. LastLocCode := Result[i].LocationCodes[j];
  829. end;
  830. ClearBuffer;
  831. end;
  832. FStream.Position := CachedStreamPos;
  833. end;
  834. constructor TChmSearchReader.Create ( AStream: TStream;
  835. AFreeStreamOnDestroy: Boolean ) ;
  836. begin
  837. FStream := AStream;
  838. FFreeStreamOnDestroy := AFreeStreamOnDestroy;
  839. ReadCommonData;
  840. end;
  841. destructor TChmSearchReader.Destroy;
  842. begin
  843. if FFreeStreamOnDestroy then
  844. FreeAndNil(FStream);
  845. inherited Destroy;
  846. end;
  847. procedure TChmSearchReader.DumpData (
  848. AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
  849. var
  850. LastWord: String;
  851. TheWord: String;
  852. InTitle: Boolean;
  853. WLCCount: DWord;
  854. WLCOffset: DWord;
  855. WLCSize: DWord;
  856. FoundHits: TChmWLCTopicArray;
  857. i: Integer;
  858. j: Integer;
  859. begin
  860. MoveToFirstLeafNode;
  861. LastWord := '';
  862. repeat
  863. if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
  864. begin
  865. if FnextLeafNode <> 0 then
  866. begin
  867. MoveToNode(FnextLeafNode, 1);
  868. LastWord := '';
  869. end
  870. else
  871. Break;
  872. end
  873. else begin
  874. LastWord := TheWord;
  875. //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
  876. FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  877. //WriteLn('DONE Reading Hits for ', TheWord);
  878. // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
  879. for i := 0 to High(FoundHits) do
  880. for j := 0 to High(FoundHits[i].LocationCodes) do
  881. AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
  882. end;
  883. until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
  884. end;
  885. function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  886. var
  887. LastWord: String;
  888. NewWord: String;
  889. NodeLevel: Integer;
  890. NewNodePosition: DWord;
  891. InTitle: Boolean;
  892. WLCCount: DWord;
  893. WLCOffset: DWord;
  894. WLCSize: DWord;
  895. CompareResult: Integer;
  896. ReadNextResult: Boolean;
  897. begin
  898. AWord := LowerCase(AWord);
  899. NodeLevel := FTreeDepth;
  900. MoveToRootNode;
  901. SetLength(Result, 0);
  902. LastWord := '';
  903. // descend the index node tree until we find the leafnode
  904. while NodeLevel > 1 do begin
  905. //WriteLn('At Node Level ', NodeLevel);
  906. if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
  907. begin
  908. LastWord := NewWord;
  909. //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
  910. if ChmCompareText(NewWord, AWord) >= 0 then
  911. begin
  912. LastWord := '';
  913. Dec(NodeLevel);
  914. MoveToNode(NewNodePosition, NodeLevel);
  915. end;
  916. end
  917. else
  918. Break;
  919. end;
  920. if NodeLevel > 1 then
  921. Exit; // the entry we are looking for is > than the last entry of the last index node
  922. // now we are in a leafnode
  923. while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
  924. begin
  925. //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
  926. LastWord := NewWord;
  927. if Length(NewWord) < Length(AWord) then
  928. continue;
  929. if AStartsWith then //it only has to start with the searched term
  930. CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord)))
  931. else // it must match exactly
  932. CompareResult := ChmCompareText(AWord, NewWord);
  933. if CompareResult < 0 then
  934. Exit;
  935. if CompareResult = 0 then
  936. begin
  937. if InTitle then
  938. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  939. else
  940. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  941. // check if the next entry is the same word since there is an entry for titles and for body
  942. if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
  943. ReadNextResult := True
  944. else if (FNextLeafNode <> 0) then
  945. begin
  946. MoveToNode(FNextLeafNode, 1);
  947. LastWord := '';
  948. ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
  949. end;
  950. if ReadNextResult and (NewWord = AWord) then
  951. begin
  952. if InTitle then
  953. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  954. else
  955. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  956. end;
  957. Exit;
  958. end;
  959. end;
  960. end;
  961. end.