chmfiftimain.pas 29 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093
  1. { Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., i51 Franklin Street, Fifth Floor, Boston, MA 02111-1301, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. unit chmFiftiMain;
  19. {$mode objfpc}{$H+}
  20. interface
  21. uses Classes, HTMLIndexer;
  22. type
  23. TFiftiMainHeader = record
  24. Sig: array [0..3] of byte; //$00,$00,$28,$00
  25. HTMLFilesCount: DWord;
  26. RootNodeOffset: DWord;
  27. Unknown1: DWord; // = 0
  28. LeafNodeCount: DWord;
  29. CopyOfRootNodeOffset: DWord;
  30. TreeDepth: Word;
  31. Unknown2: DWord; // = 7
  32. DocIndexScale: Byte;
  33. DocIndexRootSize: Byte;
  34. CodeCountScale: Byte;
  35. CodeCountRootSize: Byte;
  36. LocationCodeScale: Byte;
  37. LocationCodeRootSize: Byte;
  38. Unknown3: array[0..9] of byte; // = 0
  39. NodeSize: DWord; // 4096;
  40. Unknown4: DWord; // 0 or 1;
  41. LastDupWordIndex: DWord;
  42. LastDupCharIndex: DWord;
  43. LongestWordLength: DWord; // maximum 99
  44. TotalWordsIndexed: DWord; // includes duplicates
  45. TotalWords: DWord; // word count not including duplicates
  46. TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
  47. TotalWordsLengthPart2: DWord;
  48. TotalWordsLength: DWord; // length of all words not including duplicates
  49. WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
  50. Unknown5: DWord; // 0
  51. HTMLFilesCountMinusOne: DWord; // maybe
  52. Unknown6: array[0..23] of Byte; // 0
  53. WindowsCodePage: DWord; // usually 1252
  54. LocalID: DWord;
  55. //Unknown7: array [0..893] of Byte; // 0
  56. end;
  57. { TFIftiNode }
  58. TFIftiNode = class(TObject)
  59. FLastWord: String;
  60. FWriteStream: TStream;
  61. FBlockStream: TMemoryStream;
  62. ParentNode: TFIftiNode;
  63. OwnsParentNode : boolean;
  64. function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
  65. procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
  66. function GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
  67. procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
  68. procedure FillRemainingSpace;
  69. function RemainingSpace: DWord;
  70. constructor Create(AStream: TStream);
  71. destructor Destroy; override;
  72. end;
  73. { TChmSearchWriter }
  74. TChmSearchWriter = class(TObject)
  75. private
  76. FHeaderRec: TFiftiMainHeader;
  77. FStream: TStream;
  78. FWordList: TIndexedWordList;
  79. FActiveLeafNode: TFIftiNode;
  80. function GetHasData: Boolean;
  81. procedure ProcessWords;
  82. procedure WriteHeader(IsPlaceHolder: Boolean);
  83. procedure WriteAWord(AWord: TIndexedWord);
  84. public
  85. procedure WriteToStream;
  86. property HasData: Boolean read GetHasData;
  87. constructor Create(AStream: TStream; AWordList: TIndexedWordList);
  88. destructor Destroy; override;
  89. end;
  90. { TChmSearchReader }
  91. TChmWLCTopic = record
  92. TopicIndex: DWord;
  93. LocationCodes: array of DWord;
  94. end;
  95. TChmWLCTopicArray = array of TChmWLCTopic;
  96. TChmSearchReader = class;
  97. TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;
  98. TChmSearchReader = class(TObject)
  99. private
  100. FStream: TStream;
  101. FFileIsValid: Boolean;
  102. FFreeStreamOnDestroy: Boolean;
  103. FDocRootSize,
  104. FCodeCountRootSize,
  105. FLocCodeRootSize: Integer;
  106. FTreeDepth: Integer;
  107. FRootNodeOffset: DWord;
  108. FActiveNodeStart: DWord;
  109. FActiveNodeFreeSpace: Word;
  110. FNextLeafNode: DWord;
  111. procedure ReadCommonData;
  112. procedure MoveToFirstLeafNode;
  113. procedure MoveToRootNode;
  114. procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  115. function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
  116. function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
  117. function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
  118. function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
  119. public
  120. constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
  121. destructor Destroy; override;
  122. procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
  123. function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  124. property FileIsValid: Boolean read FFileIsValid;
  125. end;
  126. const
  127. FIFTI_NODE_SIZE = 4096;
  128. implementation
  129. uses SysUtils, Math, ChmBase;
  130. type
  131. { TIndexNode }
  132. TIndexNode = class(TFIftiNode)
  133. function GuessIfCanHold(AWord: String): Boolean; override;
  134. procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
  135. procedure Flush(NewBlockNeeded: Boolean); override;
  136. end;
  137. { TLeafNode }
  138. TLeafNode = class(TFIftiNode)
  139. FLeafNodeCount: DWord;
  140. FLastNodeStart: DWord;
  141. FreeSpace: DWord;
  142. FDocRootSize,
  143. FCodeRootSize,
  144. FLocRootSize: Byte;
  145. procedure WriteInitialHeader;
  146. Destructor Destroy; override;
  147. function GuessIfCanHold(AWord: String): Boolean; override;
  148. procedure Flush(NewBlockNeeded: Boolean); override;
  149. procedure AddWord(AWord: TIndexedWord);
  150. function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
  151. property LeafNodeCount: DWord read FLeafNodeCount;
  152. property DocRootSize: Byte read FDocRootSize write FDocRootSize;
  153. property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
  154. property LocRootSize: Byte read FLocRootSize write FLocRootSize;
  155. end;
  156. function GetCompressedIntegerBE(Stream: TStream): DWord;
  157. var
  158. Buf: Byte;
  159. Value: Dword = 0;
  160. Shift: Integer = 0;
  161. begin
  162. repeat
  163. Buf := Stream.ReadByte;
  164. Value := Value or (Buf and $7F) shl Shift;
  165. Inc(Shift, 7);
  166. until (Buf and $80) = 0;
  167. Result := Value;
  168. end;
  169. procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
  170. var
  171. Bits: Integer;
  172. Tmp: DWord;
  173. Buf: Byte;
  174. begin
  175. Tmp := AInt;
  176. Bits := 0;
  177. while Tmp <> 0 do
  178. begin
  179. Tmp := Tmp shr 1;
  180. Inc(Bits);
  181. end;
  182. repeat
  183. Buf := (AInt shr (Tmp * 7)) and $7F;
  184. if Bits > 7 then
  185. Buf := Buf or $80;
  186. Dec(Bits, 7);
  187. Inc(Tmp);
  188. Stream.WriteByte(Buf);
  189. until Bits <= 0;
  190. end;
  191. function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
  192. var
  193. Tmp: DWord;
  194. Mask: DWord;
  195. // Scale: Integer;
  196. NeededBits: Integer;
  197. PrefixBits: Integer;
  198. RootBits: Integer;
  199. begin
  200. // Scale := 2;
  201. Bits := 0;
  202. Result := Root;
  203. Tmp := ANumber;
  204. NeededBits := 0;
  205. while Tmp <> 0 do
  206. begin
  207. Inc(NeededBits);
  208. Tmp := Tmp shr 1;
  209. end;
  210. PrefixBits := Max(0, NeededBits-Root);
  211. RootBits := NeededBits -1; //
  212. if RootBits < Root then
  213. RootBits := Root;
  214. if RootBits < 0 then
  215. RootBits := 0;
  216. Mask := 0;
  217. if RootBits-1 >= 0 then
  218. for Tmp := 0 to RootBits-1 do
  219. Mask := Mask or (DWord(1) shl Tmp);
  220. Bits := not Mask;
  221. Bits := Bits shl 1; // make space for empty bit
  222. Bits := Bits or (ANumber and Mask);
  223. Result := PrefixBits + 1 + RootBits;
  224. Bits := (Bits shl (32-Result)) shr (32 - Result);
  225. end;
  226. { TChmSearchWriter }
  227. procedure TChmSearchWriter.ProcessWords;
  228. begin
  229. FWordList.ForEach(@WriteAword);
  230. if FActiveLeafNode <> nil then
  231. FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
  232. end;
  233. function TChmSearchWriter.GetHasData: Boolean;
  234. begin
  235. Result := FWordList.IndexedFileCount > 0;
  236. end;
  237. procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
  238. var
  239. TmpNode: TFIftiNode;
  240. i: Integer;
  241. begin
  242. if IsPlaceHolder then
  243. begin
  244. FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
  245. FStream.Position := $400;
  246. FillChar(PChar(TMemoryStream(FStream).Memory)^, $400, 0);
  247. FHeaderRec.DocIndexRootSize := 1;
  248. FHeaderRec.CodeCountRootSize := 1;
  249. FHeaderRec.LocationCodeRootSize := 4;
  250. Exit;
  251. end;
  252. // write the glorious header
  253. FHeaderRec.Sig[2] := $28;
  254. FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
  255. FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE;
  256. FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
  257. FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
  258. FHeaderRec.TreeDepth := 0;
  259. TmpNode := FActiveLeafNode;
  260. while TmpNode <> nil do
  261. begin
  262. Inc(FHeaderRec.TreeDepth);
  263. TmpNode := TmpNode.ParentNode;
  264. end;
  265. FHeaderRec.DocIndexScale := 2;
  266. FHeaderRec.CodeCountScale := 2;
  267. FHeaderRec.LocationCodeScale := 2;
  268. //FHeaderRec.DocIndexRootSize := 15;
  269. //FHeaderRec.CodeCountRootSize := 15;
  270. //FHeaderRec.LocationCodeRootSize := 15;
  271. FHeaderRec.NodeSize := FIFTI_NODE_SIZE;
  272. FHeaderRec.LongestWordLength := FWordList.LongestWord;
  273. FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
  274. FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
  275. FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
  276. FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
  277. FHeaderRec.WindowsCodePage := 1252;
  278. FStream.Position := 0;
  279. FStream.Write(FHeaderRec.Sig[0], 4);
  280. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
  281. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
  282. FStream.WriteDWord(NtoLE(0)); // unknown 1
  283. FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
  284. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
  285. FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
  286. FStream.WriteDWord(NtoLE(DWord(7)));
  287. FStream.WriteByte(2);
  288. FStream.WriteByte(FHeaderRec.DocIndexRootSize);
  289. FStream.WriteByte(2);
  290. FStream.WriteByte(FHeaderRec.CodeCountRootSize);
  291. FStream.WriteByte(2);
  292. FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
  293. // eat 10 bytes
  294. FStream.WriteWord(0);
  295. FStream.WriteDWord(0);
  296. FStream.WriteDWord(0);
  297. FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
  298. FStream.WriteDWord(NtoLE(DWord(0)));
  299. FStream.WriteDWord(1);
  300. FStream.WriteDWord(5);
  301. FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
  302. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
  303. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
  304. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
  305. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
  306. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
  307. FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
  308. FStream.WriteDWord(NtoLE(0));
  309. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
  310. for i := 0 to 23 do FStream.WriteByte(0);
  311. FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
  312. FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
  313. for i := 0 to 893 do FStream.WriteByte(0);
  314. end;
  315. procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;
  316. begin
  317. if FActiveLeafNode = nil then
  318. begin
  319. FActiveLeafNode := TLeafNode.Create(FStream);
  320. with TLeafNode(FActiveLeafNode) do
  321. begin
  322. DocRootSize := FHeaderRec.DocIndexRootSize;
  323. CodeRootSize := FHeaderRec.CodeCountRootSize;
  324. LocRootSize := FHeaderRec.LocationCodeRootSize;
  325. end;
  326. end;
  327. if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
  328. begin
  329. FActiveLeafNode.Flush(True);
  330. end;
  331. TLeafNode(FActiveLeafNode).AddWord(AWord);
  332. end;
  333. procedure TChmSearchWriter.WriteToStream;
  334. begin
  335. WriteHeader(True);
  336. ProcessWords;
  337. WriteHeader(False);
  338. end;
  339. constructor TChmSearchWriter.Create ( AStream: TStream;
  340. AWordList: TIndexedWordList ) ;
  341. begin
  342. FStream := AStream;
  343. FWordList := AWordList;
  344. FActiveLeafNode:=NIL;
  345. end;
  346. destructor TChmSearchWriter.Destroy;
  347. begin
  348. freeandnil(FActiveLeafNode);
  349. end;
  350. { TLeafNode }
  351. function TFIftiNode.RemainingSpace: DWord;
  352. begin
  353. Result := FIFTI_NODE_SIZE - FBlockStream.Position;
  354. end;
  355. constructor TFIftiNode.Create ( AStream: TStream ) ;
  356. begin
  357. inherited Create;
  358. FWriteStream := AStream;
  359. FBlockStream := TMemoryStream.Create;
  360. OwnsParentNode :=false;
  361. end;
  362. destructor TFIftiNode.Destroy;
  363. begin
  364. FBlockStream.Free;
  365. if OwnsParentNode then ParentNode.Free;
  366. inherited Destroy;
  367. end;
  368. procedure TFIftiNode.FillRemainingSpace;
  369. begin
  370. while RemainingSpace > 0 do
  371. FBlockStream.WriteByte(0);
  372. end;
  373. function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
  374. var
  375. Count1,
  376. Count2: Integer;
  377. Count: Integer;
  378. i: Integer;
  379. begin
  380. if AWord = AOldWord then
  381. begin
  382. AOffset := Length(AWord);
  383. Exit('');
  384. end;
  385. // else
  386. Count1 := Length(AOldWord);
  387. Count2 := Length(AWord);
  388. if Count1<Count2 then
  389. Count := Count1
  390. else
  391. Count := Count2;
  392. for i := 1 to Count do
  393. begin
  394. AOffset := i-1;
  395. if AOldWord[i] <> AWord[i]
  396. then Exit(Copy(AWord, i, Length(AWord)));
  397. end;
  398. Result := AWord;
  399. AOffset := 0;
  400. end;
  401. procedure TLeafNode.WriteInitialHeader;
  402. begin
  403. FBlockStream.WriteDWord(0);
  404. FBlockStream.WriteWord(0);
  405. FBlockStream.WriteWord(0);
  406. end;
  407. destructor TLeafNode.Destroy;
  408. begin
  409. inherited Destroy;
  410. end;
  411. function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
  412. var
  413. WordOffset: Byte;
  414. begin
  415. Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
  416. end;
  417. procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
  418. var
  419. FTmpPos: DWord;
  420. begin
  421. Inc(FLeafNodeCount);
  422. FTmpPos := FWriteStream.Position;
  423. // update the previous leaf node about our position.
  424. if FLastNodeStart > 0 then
  425. begin
  426. FWriteStream.Position := FLastNodeStart;
  427. FWriteStream.WriteDWord(NtoLE(FTmpPos));
  428. FWriteStream.Position := FTmpPos;
  429. end;
  430. FLastNodeStart := FTmpPos;
  431. FreeSpace := RemainingSpace;
  432. FillRemainingSpace;
  433. // update the leaf header to show the available space.
  434. FBlockStream.Position := 6;
  435. FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));
  436. // copy the leaf block to the fiftimain file
  437. FBlockStream.Position := 0;
  438. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  439. FBlockStream.Position := 0;
  440. if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
  441. begin
  442. if ParentNode = nil then
  443. begin
  444. ParentNode := TIndexNode.Create(FWriteStream);
  445. OwnsParentNode:=True;
  446. end;
  447. ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
  448. if (NewBlockNeeded = False) then
  449. ParentNode.Flush(False);
  450. end;
  451. FLastWord := '';
  452. end;
  453. procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
  454. var
  455. Offset: Byte;
  456. NewWord: String;
  457. WLCSize: DWord;
  458. begin
  459. if Length(AWord.TheWord) > 99 then
  460. Exit; // Maximum word length is 99
  461. if FBlockStream.Position = 0 then
  462. WriteInitialHeader;
  463. NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);
  464. FLastWord := AWord.TheWord;
  465. FBlockStream.WriteByte(Length(NewWord)+1);
  466. FBlockStream.WriteByte(Offset);
  467. // length can be 0 if it is the same word as the last. there is a word entry each for title and content
  468. if Length(NewWord) > 0 then
  469. FBlockStream.Write(NewWord[1], Length(NewWord));
  470. FBlockStream.WriteByte(Ord(AWord.IsTitle));
  471. WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
  472. FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
  473. FBlockStream.WriteWord(0);
  474. // write WLC to FWriteStream so we can write the size of the wlc entries
  475. WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
  476. WriteCompressedIntegerBE(FBlockStream, WLCSize);
  477. if FBlockStream.Position > FIFTI_NODE_SIZE then
  478. raise Exception.Create('FIFTIMAIN Leaf node has written past the block!');
  479. end;
  480. function Min(AValue, BValue: Byte): Byte;
  481. begin
  482. if AValue < BValue then
  483. Result := AValue
  484. else Result := BValue;
  485. end;
  486. function Max(AValue, BValue: Byte): Byte;
  487. begin
  488. if AValue > BValue then
  489. Result := AValue
  490. else Result := BValue;
  491. end;
  492. function Max(AValue, BValue: Integer): Integer;
  493. begin
  494. if AValue > BValue then
  495. Result := AValue
  496. else Result := BValue;
  497. end;
  498. function Max(AValue, BValue: DWord): DWord;
  499. begin
  500. if AValue > BValue then
  501. Result := AValue
  502. else Result := BValue;
  503. end;
  504. function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
  505. var
  506. LastDocIndex: DWord;
  507. LastLocCode: DWord;
  508. UsedBits: Byte;
  509. Buf: Byte;
  510. function NewDocDelta(ADocIndex: DWord): DWord;
  511. begin
  512. Result := ADocIndex - LastDocIndex;
  513. LastDocIndex := ADocIndex;
  514. end;
  515. function NewLocCode(ALocCode: DWord): DWord;
  516. begin
  517. Result := ALocCode - LastLocCode;
  518. LastLocCode := ALocCode;
  519. end;
  520. procedure AddValue(AValue: DWord; BitCount: Byte);
  521. var
  522. NeededBits: Byte;
  523. Tmp: Byte;
  524. begin
  525. AValue := AValue shl (32 - BitCount);
  526. while BitCount > 0 do
  527. begin
  528. NeededBits := 8 - UsedBits;
  529. Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
  530. Buf := Buf or Tmp;
  531. Inc(UsedBits, Min(BitCount, NeededBits));
  532. AValue := AValue shl Min(BitCount, NeededBits);
  533. Dec(BitCount, Min(BitCount, NeededBits));
  534. if (UsedBits = 8) then
  535. begin
  536. FWriteStream.WriteByte(Buf);
  537. UsedBits := 0;
  538. NeededBits := 0;
  539. Buf := 0;
  540. end;
  541. end;
  542. end;
  543. procedure FlushBuffer;
  544. begin
  545. if UsedBits > 0 then
  546. FWriteStream.WriteByte(Buf);
  547. UsedBits := 0;
  548. Buf := 0;
  549. end;
  550. var
  551. DocDelta: DWord;
  552. LocDelta: DWord;
  553. StartPos: DWord;
  554. Bits: DWord;
  555. BitCount: Byte;
  556. i,
  557. j: Integer;
  558. Doc: TIndexDocument;
  559. // proced
  560. begin
  561. StartPos := FWriteStream.Position;
  562. LastDocIndex := 0;
  563. UsedBits := 0;
  564. Buf := 0;
  565. for i := 0 to AWord.DocumentCount-1 do
  566. begin
  567. LastLocCode := 0;
  568. Doc := AWord.GetLogicalDocument(i);
  569. DocDelta := NewDocDelta(Doc.DocumentIndex);
  570. BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
  571. AddValue(Bits, BitCount);
  572. BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
  573. AddValue(Bits, BitCount);
  574. for j := 0 to Doc.NumberOfIndexEntries-1 do
  575. begin
  576. LocDelta := NewLocCode(Doc.IndexEntry[j]);
  577. BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
  578. AddValue(Bits, BitCount);
  579. end;
  580. FlushBuffer;
  581. end;
  582. Result := FWriteStream.Position-StartPos;
  583. end;
  584. { TIndexNode }
  585. function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
  586. var
  587. Offset: Byte;
  588. begin
  589. Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
  590. end;
  591. procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
  592. var
  593. Offset: Byte;
  594. NewWord: String;
  595. begin
  596. if FBlockStream.Position = 0 then
  597. FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
  598. if GuessIfCanHold(AWord) = False then
  599. Flush(True);
  600. NewWord := AdjustedWord(AWord, Offset, FLastWord);
  601. FLastWord:=AWord;
  602. // Write the Index node Entry
  603. FBlockStream.WriteByte(Length(NewWord)+1);
  604. FBlockStream.WriteByte(Offset);
  605. FBlockStream.Write(NewWord[1], Length(NewWord));
  606. FBlockStream.WriteDWord(NtoLE(ANodeOffset));
  607. FBlockStream.WriteWord(0);
  608. if FBlockStream.Position > FIFTI_NODE_SIZE then
  609. raise Exception.Create('FIFTIMAIN Index node has written past the block!');
  610. end;
  611. procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
  612. var
  613. RemSize: DWord;
  614. begin
  615. if NewBlockNeeded then
  616. begin
  617. if ParentNode = nil then
  618. begin
  619. ParentNode := TIndexNode.Create(FWriteStream);
  620. OwnsParentNode:=True;
  621. end;
  622. end;
  623. if ParentNode <> nil then
  624. ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);
  625. RemSize := RemainingSpace;
  626. FillRemainingSpace;
  627. FBlockStream.Position := 0;
  628. FBlockStream.WriteWord(NtoLE(RemSize));
  629. FBlockStream.Position := 0;
  630. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  631. FBlockStream.Position := 0;
  632. FLastWord := '';
  633. if NewBlockNeeded then
  634. FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
  635. else
  636. if ParentNode <> nil then
  637. ParentNode.Flush(NewBlockNeeded);
  638. end;
  639. { TChmSearchReader }
  640. procedure TChmSearchReader.ReadCommonData;
  641. var
  642. Sig: DWord;
  643. begin
  644. FStream.Position := 0;
  645. Sig := LEtoN(FStream.ReadDWord);
  646. FFileIsValid := Sig = $00280000;
  647. if not FileIsValid then
  648. Exit;
  649. // root node address
  650. FStream.Position := $8;
  651. FRootNodeOffset := LEtoN(FStream.ReadDWord);
  652. // Tree Depth
  653. FStream.Position := $18;
  654. FTreeDepth := LEtoN(FStream.ReadWord);
  655. // Root sizes for scale and root integers
  656. FStream.Position := $1E;
  657. if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
  658. FFileIsValid := False;
  659. FDocRootSize := FStream.ReadByte;
  660. if FStream.ReadByte <> 2 then
  661. FFileIsValid := False;
  662. FCodeCountRootSize := FStream.ReadByte;
  663. if FStream.ReadByte <> 2 then
  664. FFileIsValid := False;
  665. FLocCodeRootSize := FStream.ReadByte;
  666. end;
  667. procedure TChmSearchReader.MoveToFirstLeafNode;
  668. var
  669. NodeDepth: Integer;
  670. NodeOffset: DWord;
  671. LastWord: String;
  672. NewWord: String;
  673. begin
  674. NodeDepth := FTreeDepth;
  675. MoveToRootNode;
  676. while NodeDepth > 1 do
  677. begin
  678. LastWord := '';
  679. ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
  680. Dec(NodeDepth);
  681. MoveToNode(NodeOffset, NodeDepth);
  682. end;
  683. end;
  684. procedure TChmSearchReader.MoveToRootNode;
  685. begin
  686. MoveToNode(FRootNodeOffset, FTreeDepth);
  687. end;
  688. procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  689. begin
  690. FStream.Position := ANodeOffset;
  691. FActiveNodeStart := FStream.Position;
  692. if ANodeDepth > 1 then
  693. begin
  694. FnextLeafNode := 0;
  695. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
  696. end
  697. else
  698. begin
  699. FnextLeafNode := LEtoN(FStream.ReadDWord);
  700. FStream.ReadWord;
  701. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
  702. end;
  703. end;
  704. function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
  705. var
  706. WordLength: Integer;
  707. CopyLastWordCharCount: Integer;
  708. begin
  709. WordLength := FStream.ReadByte;
  710. CopyLastWordCharCount := FStream.ReadByte;
  711. if CopyLastWordCharCount > 0 then
  712. Result := Copy(ALastWord, 1, CopyLastWordCharCount);
  713. SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
  714. if WordLength > 1 then
  715. FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
  716. end;
  717. function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
  718. ASubNodeStart: DWord ): Boolean;
  719. begin
  720. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  721. if not Result then
  722. Exit;
  723. AWord := ReadWordOrPartialWord(ALastWord);
  724. ASubNodeStart := LEtoN(FStream.ReadDWord);
  725. FStream.ReadWord;
  726. end;
  727. function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
  728. AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
  729. AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
  730. begin
  731. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  732. if not Result then
  733. Exit;
  734. AWord := ReadWordOrPartialWord(ALastWord);
  735. AInTitle := FStream.ReadByte = 1;
  736. AWLCCount := GetCompressedIntegerBE(FStream);
  737. AWLCOffset := LEtoN(FStream.ReadDWord);
  738. FStream.ReadWord;
  739. AWLCSize := GetCompressedIntegerBE(FStream);
  740. end;
  741. function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;
  742. function AtEndOfWLCEntries: Boolean;
  743. begin
  744. Result := AWLCOffset + AWLCSize <= FStream.Position;
  745. end;
  746. var
  747. Buf: Byte;
  748. BitsInBuffer: Integer;
  749. procedure FillBuffer;
  750. begin
  751. while (BitsInBuffer = 0) and not AtEndOfWLCEntries do
  752. begin
  753. Buf := FStream.ReadByte;
  754. Inc(BitsInBuffer, 8);
  755. end;
  756. end;
  757. function ReadWLC(RootSize: DWord): DWord;
  758. var
  759. PrefixBits: Integer = 0;
  760. BitCount: Integer = 0;
  761. RemainingBits: Integer; // only the bits for this number not the bits in buffer
  762. begin
  763. FillBuffer;
  764. Result := 0;
  765. while (Buf and $80) > 0 do // find out how many prefix bits there are
  766. begin
  767. Inc(PrefixBits);
  768. Buf := Buf shl 1;
  769. Dec(BitsInBuffer);
  770. FillBuffer;
  771. end;
  772. if PrefixBits > 0 then
  773. Result := 1;
  774. Inc(BitCount, PrefixBits+1);
  775. Buf := Buf shl 1;
  776. Dec(BitsInBuffer);
  777. FillBuffer;
  778. Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
  779. while RemainingBits > 0 do
  780. begin
  781. Result := Result shl 1;
  782. Result := Result or (Buf shr 7);
  783. Dec(RemainingBits);
  784. Buf := Buf shl 1;
  785. Dec(BitsInBuffer);
  786. FillBuffer;
  787. Inc(BitCount);
  788. end;
  789. end;
  790. procedure ClearBuffer;
  791. begin
  792. BitsInBuffer := 0;
  793. Buf := 0;
  794. end;
  795. var
  796. TopicHits: DWord;
  797. i: Integer;
  798. j: Integer;
  799. CachedStreamPos: QWord;
  800. LastDoc,
  801. LastLocCode: DWord;
  802. begin
  803. CachedStreamPos := FStream.Position;
  804. FStream.Position := AWLCOffset;
  805. {for i := 0 to AWLCSize-1 do
  806. begin
  807. Buf := FStream.ReadByte;
  808. Write(binStr(Buf, 8), ' ');
  809. end;}
  810. FStream.Position := AWLCOffset;
  811. SetLength(Result, AWLCCount);
  812. Buf := 0;
  813. BitsInBuffer := 0;
  814. LastDoc := 0;
  815. for i := 0 to AWLCCount-1 do
  816. begin
  817. Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;
  818. LastDoc := Result[i].TopicIndex;
  819. TopicHits := ReadWLC(FCodeCountRootSize);
  820. SetLength(Result[i].LocationCodes, TopicHits);
  821. LastLocCode := 0;
  822. for j := 0 to TopicHits-1 do
  823. begin
  824. Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
  825. LastLocCode := Result[i].LocationCodes[j];
  826. end;
  827. ClearBuffer;
  828. end;
  829. FStream.Position := CachedStreamPos;
  830. end;
  831. constructor TChmSearchReader.Create ( AStream: TStream;
  832. AFreeStreamOnDestroy: Boolean ) ;
  833. begin
  834. FStream := AStream;
  835. FFreeStreamOnDestroy := AFreeStreamOnDestroy;
  836. ReadCommonData;
  837. end;
  838. destructor TChmSearchReader.Destroy;
  839. begin
  840. if FFreeStreamOnDestroy then
  841. FreeAndNil(FStream);
  842. inherited Destroy;
  843. end;
  844. procedure TChmSearchReader.DumpData (
  845. AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
  846. var
  847. LastWord: String;
  848. TheWord: String;
  849. InTitle: Boolean;
  850. WLCCount: DWord;
  851. WLCOffset: DWord;
  852. WLCSize: DWord;
  853. FoundHits: TChmWLCTopicArray;
  854. i: Integer;
  855. j: Integer;
  856. begin
  857. MoveToFirstLeafNode;
  858. LastWord := '';
  859. repeat
  860. if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
  861. begin
  862. if FnextLeafNode <> 0 then
  863. begin
  864. MoveToNode(FnextLeafNode, 1);
  865. LastWord := '';
  866. end
  867. else
  868. Break;
  869. end
  870. else begin
  871. LastWord := TheWord;
  872. //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
  873. FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  874. //WriteLn('DONE Reading Hits for ', TheWord);
  875. // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
  876. for i := 0 to High(FoundHits) do
  877. for j := 0 to High(FoundHits[i].LocationCodes) do
  878. AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
  879. end;
  880. until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
  881. end;
  882. function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  883. var
  884. LastWord: String;
  885. NewWord: String;
  886. NodeLevel: Integer;
  887. NewNodePosition: DWord;
  888. InTitle: Boolean;
  889. WLCCount: DWord;
  890. WLCOffset: DWord;
  891. WLCSize: DWord;
  892. CompareResult: Integer;
  893. ReadNextResult: Boolean;
  894. begin
  895. AWord := LowerCase(AWord);
  896. NodeLevel := FTreeDepth;
  897. MoveToRootNode;
  898. SetLength(Result, 0);
  899. LastWord := '';
  900. // descend the index node tree until we find the leafnode
  901. while NodeLevel > 1 do begin
  902. //WriteLn('At Node Level ', NodeLevel);
  903. if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
  904. begin
  905. LastWord := NewWord;
  906. //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
  907. if ChmCompareText(NewWord, AWord) >= 0 then
  908. begin
  909. LastWord := '';
  910. Dec(NodeLevel);
  911. MoveToNode(NewNodePosition, NodeLevel);
  912. end;
  913. end
  914. else
  915. Break;
  916. end;
  917. if NodeLevel > 1 then
  918. Exit; // the entry we are looking for is > than the last entry of the last index node
  919. // now we are in a leafnode
  920. while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
  921. begin
  922. //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
  923. LastWord := NewWord;
  924. if Length(NewWord) < Length(AWord) then
  925. continue;
  926. if AStartsWith then //it only has to start with the searched term
  927. CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord)))
  928. else // it must match exactly
  929. CompareResult := ChmCompareText(AWord, NewWord);
  930. if CompareResult < 0 then
  931. Exit;
  932. if CompareResult = 0 then
  933. begin
  934. if InTitle then
  935. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  936. else
  937. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  938. // check if the next entry is the same word since there is an entry for titles and for body
  939. if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
  940. ReadNextResult := True
  941. else if (FNextLeafNode <> 0) then
  942. begin
  943. MoveToNode(FNextLeafNode, 1);
  944. LastWord := '';
  945. ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
  946. end;
  947. if ReadNextResult and (NewWord = AWord) then
  948. begin
  949. if InTitle then
  950. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  951. else
  952. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  953. end;
  954. Exit;
  955. end;
  956. end;
  957. end;
  958. end.