chmfiftimain.pas 30 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108
  1. { Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., i51 Franklin Street, Fifth Floor, Boston, MA 02111-1301, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. {$IFNDEF FPC_DOTTEDUNITS}
  19. unit chmFiftiMain;
  20. {$ENDIF FPC_DOTTEDUNITS}
  21. {$mode objfpc}{$H+}
  22. interface
  23. {$IFDEF FPC_DOTTEDUNITS}
  24. uses System.Classes, Chm.HtmlIndexer;
  25. {$ELSE FPC_DOTTEDUNITS}
  26. uses Classes, HTMLIndexer;
  27. {$ENDIF FPC_DOTTEDUNITS}
  28. type
  29. TFiftiMainHeader = record
  30. Sig: array [0..3] of byte; //$00,$00,$28,$00
  31. HTMLFilesCount: DWord;
  32. RootNodeOffset: DWord;
  33. Unknown1: DWord; // = 0
  34. LeafNodeCount: DWord;
  35. CopyOfRootNodeOffset: DWord;
  36. TreeDepth: Word;
  37. Unknown2: DWord; // = 7
  38. DocIndexScale: Byte;
  39. DocIndexRootSize: Byte;
  40. CodeCountScale: Byte;
  41. CodeCountRootSize: Byte;
  42. LocationCodeScale: Byte;
  43. LocationCodeRootSize: Byte;
  44. Unknown3: array[0..9] of byte; // = 0
  45. NodeSize: DWord; // 4096;
  46. Unknown4: DWord; // 0 or 1;
  47. LastDupWordIndex: DWord;
  48. LastDupCharIndex: DWord;
  49. LongestWordLength: DWord; // maximum 99
  50. TotalWordsIndexed: DWord; // includes duplicates
  51. TotalWords: DWord; // word count not including duplicates
  52. TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
  53. TotalWordsLengthPart2: DWord;
  54. TotalWordsLength: DWord; // length of all words not including duplicates
  55. WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
  56. Unknown5: DWord; // 0
  57. HTMLFilesCountMinusOne: DWord; // maybe
  58. Unknown6: array[0..23] of Byte; // 0
  59. WindowsCodePage: DWord; // usually 1252
  60. LocalID: DWord;
  61. //Unknown7: array [0..893] of Byte; // 0
  62. end;
  63. { TFIftiNode }
  64. TFIftiNode = class(TObject)
  65. FLastWord: String;
  66. FWriteStream: TStream;
  67. FBlockStream: TMemoryStream;
  68. ParentNode: TFIftiNode;
  69. OwnsParentNode : boolean;
  70. function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
  71. procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
  72. function GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
  73. procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
  74. procedure FillRemainingSpace;
  75. function RemainingSpace: DWord;
  76. constructor Create(AStream: TStream);
  77. destructor Destroy; override;
  78. end;
  79. { TChmSearchWriter }
  80. TChmSearchWriter = class(TObject)
  81. private
  82. FHeaderRec: TFiftiMainHeader;
  83. FStream: TStream;
  84. FWordList: TIndexedWordList;
  85. FActiveLeafNode: TFIftiNode;
  86. function GetHasData: Boolean;
  87. procedure ProcessWords;
  88. procedure WriteHeader(IsPlaceHolder: Boolean);
  89. procedure WriteAWord(AWord: TIndexedWord);
  90. public
  91. procedure WriteToStream;
  92. property HasData: Boolean read GetHasData;
  93. constructor Create(AStream: TStream; AWordList: TIndexedWordList);
  94. destructor Destroy; override;
  95. end;
  96. { TChmSearchReader }
  97. TChmWLCTopic = record
  98. TopicIndex: DWord;
  99. LocationCodes: array of DWord;
  100. end;
  101. TChmWLCTopicArray = array of TChmWLCTopic;
  102. TChmSearchReader = class;
  103. TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;
  104. TChmSearchReader = class(TObject)
  105. private
  106. FStream: TStream;
  107. FFileIsValid: Boolean;
  108. FFreeStreamOnDestroy: Boolean;
  109. FDocRootSize,
  110. FCodeCountRootSize,
  111. FLocCodeRootSize: Integer;
  112. FTreeDepth: Integer;
  113. FRootNodeOffset: DWord;
  114. FActiveNodeStart: DWord;
  115. FActiveNodeFreeSpace: Word;
  116. FNextLeafNode: DWord;
  117. procedure ReadCommonData;
  118. procedure MoveToFirstLeafNode;
  119. procedure MoveToRootNode;
  120. procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  121. function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
  122. function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
  123. function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
  124. function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
  125. public
  126. constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
  127. destructor Destroy; override;
  128. procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
  129. function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  130. property FileIsValid: Boolean read FFileIsValid;
  131. end;
  132. const
  133. FIFTI_NODE_SIZE = 4096;
  134. implementation
  135. {$IFDEF FPC_DOTTEDUNITS}
  136. uses System.SysUtils, System.Math, Chm.Base;
  137. {$ELSE FPC_DOTTEDUNITS}
  138. uses SysUtils, Math, ChmBase;
  139. {$ENDIF FPC_DOTTEDUNITS}
  140. type
  141. { TIndexNode }
  142. TIndexNode = class(TFIftiNode)
  143. function GuessIfCanHold(AWord: String): Boolean; override;
  144. procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
  145. procedure Flush(NewBlockNeeded: Boolean); override;
  146. end;
  147. { TLeafNode }
  148. TLeafNode = class(TFIftiNode)
  149. FLeafNodeCount: DWord;
  150. FLastNodeStart: DWord;
  151. FreeSpace: DWord;
  152. FDocRootSize,
  153. FCodeRootSize,
  154. FLocRootSize: Byte;
  155. procedure WriteInitialHeader;
  156. Destructor Destroy; override;
  157. function GuessIfCanHold(AWord: String): Boolean; override;
  158. procedure Flush(NewBlockNeeded: Boolean); override;
  159. procedure AddWord(AWord: TIndexedWord);
  160. function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
  161. property LeafNodeCount: DWord read FLeafNodeCount;
  162. property DocRootSize: Byte read FDocRootSize write FDocRootSize;
  163. property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
  164. property LocRootSize: Byte read FLocRootSize write FLocRootSize;
  165. end;
  166. function GetCompressedIntegerBE(Stream: TStream): DWord;
  167. var
  168. Buf: Byte;
  169. Value: Dword = 0;
  170. Shift: Integer = 0;
  171. begin
  172. repeat
  173. Buf := Stream.ReadByte;
  174. Value := Value or (Buf and $7F) shl Shift;
  175. Inc(Shift, 7);
  176. until (Buf and $80) = 0;
  177. Result := Value;
  178. end;
  179. procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
  180. var
  181. Bits: Integer;
  182. Tmp: DWord;
  183. Buf: Byte;
  184. begin
  185. Tmp := AInt;
  186. Bits := 0;
  187. while Tmp <> 0 do
  188. begin
  189. Tmp := Tmp shr 1;
  190. Inc(Bits);
  191. end;
  192. repeat
  193. Buf := (AInt shr (Tmp * 7)) and $7F;
  194. if Bits > 7 then
  195. Buf := Buf or $80;
  196. Dec(Bits, 7);
  197. Inc(Tmp);
  198. Stream.WriteByte(Buf);
  199. until Bits <= 0;
  200. end;
  201. function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
  202. var
  203. Tmp: DWord;
  204. Mask: DWord;
  205. // Scale: Integer;
  206. NeededBits: Integer;
  207. PrefixBits: Integer;
  208. RootBits: Integer;
  209. begin
  210. // Scale := 2;
  211. Bits := 0;
  212. Result := Root;
  213. Tmp := ANumber;
  214. NeededBits := 0;
  215. while Tmp <> 0 do
  216. begin
  217. Inc(NeededBits);
  218. Tmp := Tmp shr 1;
  219. end;
  220. PrefixBits := Max(0, NeededBits-Root);
  221. RootBits := NeededBits -1; //
  222. if RootBits < Root then
  223. RootBits := Root;
  224. if RootBits < 0 then
  225. RootBits := 0;
  226. Mask := 0;
  227. if RootBits-1 >= 0 then
  228. for Tmp := 0 to RootBits-1 do
  229. Mask := Mask or (DWord(1) shl Tmp);
  230. Bits := not Mask;
  231. Bits := Bits shl 1; // make space for empty bit
  232. Bits := Bits or (ANumber and Mask);
  233. Result := PrefixBits + 1 + RootBits;
  234. Bits := (Bits shl (32-Result)) shr (32 - Result);
  235. end;
  236. { TChmSearchWriter }
  237. procedure TChmSearchWriter.ProcessWords;
  238. begin
  239. FWordList.ForEach(@WriteAword);
  240. if FActiveLeafNode <> nil then
  241. FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
  242. end;
  243. function TChmSearchWriter.GetHasData: Boolean;
  244. begin
  245. Result := FWordList.IndexedFileCount > 0;
  246. end;
  247. procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
  248. var
  249. TmpNode: TFIftiNode;
  250. i: Integer;
  251. begin
  252. if IsPlaceHolder then
  253. begin
  254. FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
  255. FStream.Position := $400;
  256. FillChar(PAnsiChar(TMemoryStream(FStream).Memory)^, $400, 0);
  257. FHeaderRec.DocIndexRootSize := 1;
  258. FHeaderRec.CodeCountRootSize := 1;
  259. FHeaderRec.LocationCodeRootSize := 4;
  260. Exit;
  261. end;
  262. // write the glorious header
  263. FHeaderRec.Sig[2] := $28;
  264. FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
  265. FHeaderRec.RootNodeOffset := FStream.Size-FIFTI_NODE_SIZE;
  266. FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
  267. FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
  268. FHeaderRec.TreeDepth := 0;
  269. TmpNode := FActiveLeafNode;
  270. while TmpNode <> nil do
  271. begin
  272. Inc(FHeaderRec.TreeDepth);
  273. TmpNode := TmpNode.ParentNode;
  274. end;
  275. FHeaderRec.DocIndexScale := 2;
  276. FHeaderRec.CodeCountScale := 2;
  277. FHeaderRec.LocationCodeScale := 2;
  278. //FHeaderRec.DocIndexRootSize := 15;
  279. //FHeaderRec.CodeCountRootSize := 15;
  280. //FHeaderRec.LocationCodeRootSize := 15;
  281. FHeaderRec.NodeSize := FIFTI_NODE_SIZE;
  282. FHeaderRec.LongestWordLength := FWordList.LongestWord;
  283. FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
  284. FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
  285. FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
  286. FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
  287. FHeaderRec.WindowsCodePage := 1252;
  288. FStream.Position := 0;
  289. FStream.Write(FHeaderRec.Sig[0], 4);
  290. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
  291. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
  292. FStream.WriteDWord(NtoLE(0)); // unknown 1
  293. FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
  294. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
  295. FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
  296. FStream.WriteDWord(NtoLE(DWord(7)));
  297. FStream.WriteByte(2);
  298. FStream.WriteByte(FHeaderRec.DocIndexRootSize);
  299. FStream.WriteByte(2);
  300. FStream.WriteByte(FHeaderRec.CodeCountRootSize);
  301. FStream.WriteByte(2);
  302. FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
  303. // eat 10 bytes
  304. FStream.WriteWord(0);
  305. FStream.WriteDWord(0);
  306. FStream.WriteDWord(0);
  307. FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
  308. FStream.WriteDWord(NtoLE(DWord(0)));
  309. FStream.WriteDWord(1);
  310. FStream.WriteDWord(5);
  311. FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
  312. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
  313. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
  314. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
  315. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
  316. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
  317. FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
  318. FStream.WriteDWord(NtoLE(0));
  319. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
  320. for i := 0 to 23 do FStream.WriteByte(0);
  321. FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
  322. FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
  323. for i := 0 to 893 do FStream.WriteByte(0);
  324. end;
  325. procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;
  326. begin
  327. if FActiveLeafNode = nil then
  328. begin
  329. FActiveLeafNode := TLeafNode.Create(FStream);
  330. with TLeafNode(FActiveLeafNode) do
  331. begin
  332. DocRootSize := FHeaderRec.DocIndexRootSize;
  333. CodeRootSize := FHeaderRec.CodeCountRootSize;
  334. LocRootSize := FHeaderRec.LocationCodeRootSize;
  335. end;
  336. end;
  337. if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
  338. begin
  339. FActiveLeafNode.Flush(True);
  340. end;
  341. TLeafNode(FActiveLeafNode).AddWord(AWord);
  342. end;
  343. procedure TChmSearchWriter.WriteToStream;
  344. begin
  345. WriteHeader(True);
  346. ProcessWords;
  347. WriteHeader(False);
  348. end;
  349. constructor TChmSearchWriter.Create ( AStream: TStream;
  350. AWordList: TIndexedWordList ) ;
  351. begin
  352. FStream := AStream;
  353. FWordList := AWordList;
  354. FActiveLeafNode:=NIL;
  355. end;
  356. destructor TChmSearchWriter.Destroy;
  357. begin
  358. freeandnil(FActiveLeafNode);
  359. inherited;
  360. end;
  361. { TLeafNode }
  362. function TFIftiNode.RemainingSpace: DWord;
  363. begin
  364. Result := FIFTI_NODE_SIZE - FBlockStream.Position;
  365. end;
  366. constructor TFIftiNode.Create ( AStream: TStream ) ;
  367. begin
  368. inherited Create;
  369. FWriteStream := AStream;
  370. FBlockStream := TMemoryStream.Create;
  371. OwnsParentNode :=false;
  372. end;
  373. destructor TFIftiNode.Destroy;
  374. begin
  375. FBlockStream.Free;
  376. if OwnsParentNode then ParentNode.Free;
  377. inherited Destroy;
  378. end;
  379. procedure TFIftiNode.FillRemainingSpace;
  380. begin
  381. while RemainingSpace > 0 do
  382. FBlockStream.WriteByte(0);
  383. end;
  384. function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
  385. var
  386. Count1,
  387. Count2: Integer;
  388. Count: Integer;
  389. i: Integer;
  390. begin
  391. if AWord = AOldWord then
  392. begin
  393. AOffset := Length(AWord);
  394. Exit('');
  395. end;
  396. // else
  397. Count1 := Length(AOldWord);
  398. Count2 := Length(AWord);
  399. if Count1<Count2 then
  400. Count := Count1
  401. else
  402. Count := Count2;
  403. for i := 1 to Count do
  404. begin
  405. AOffset := i-1;
  406. if AOldWord[i] <> AWord[i]
  407. then Exit(Copy(AWord, i, Length(AWord)));
  408. end;
  409. Result := AWord;
  410. AOffset := 0;
  411. end;
  412. procedure TLeafNode.WriteInitialHeader;
  413. begin
  414. FBlockStream.WriteDWord(0);
  415. FBlockStream.WriteWord(0);
  416. FBlockStream.WriteWord(0);
  417. end;
  418. destructor TLeafNode.Destroy;
  419. begin
  420. inherited Destroy;
  421. end;
  422. function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
  423. var
  424. WordOffset: Byte;
  425. begin
  426. Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
  427. end;
  428. procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
  429. var
  430. FTmpPos: DWord;
  431. begin
  432. Inc(FLeafNodeCount);
  433. FTmpPos := FWriteStream.Position;
  434. // update the previous leaf node about our position.
  435. if FLastNodeStart > 0 then
  436. begin
  437. FWriteStream.Position := FLastNodeStart;
  438. FWriteStream.WriteDWord(NtoLE(FTmpPos));
  439. FWriteStream.Position := FTmpPos;
  440. end;
  441. FLastNodeStart := FTmpPos;
  442. FreeSpace := RemainingSpace;
  443. FillRemainingSpace;
  444. // update the leaf header to show the available space.
  445. FBlockStream.Position := 6;
  446. FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));
  447. // copy the leaf block to the fiftimain file
  448. FBlockStream.Position := 0;
  449. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  450. FBlockStream.Position := 0;
  451. if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
  452. begin
  453. if ParentNode = nil then
  454. begin
  455. ParentNode := TIndexNode.Create(FWriteStream);
  456. OwnsParentNode:=True;
  457. end;
  458. ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
  459. if (NewBlockNeeded = False) then
  460. ParentNode.Flush(False);
  461. end;
  462. FLastWord := '';
  463. end;
  464. procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
  465. var
  466. Offset: Byte;
  467. NewWord: String;
  468. WLCSize: DWord;
  469. begin
  470. if Length(AWord.TheWord) > 99 then
  471. Exit; // Maximum word length is 99
  472. if FBlockStream.Position = 0 then
  473. WriteInitialHeader;
  474. NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);
  475. FLastWord := AWord.TheWord;
  476. FBlockStream.WriteByte(Length(NewWord)+1);
  477. FBlockStream.WriteByte(Offset);
  478. // length can be 0 if it is the same word as the last. there is a word entry each for title and content
  479. if Length(NewWord) > 0 then
  480. FBlockStream.Write(NewWord[1], Length(NewWord));
  481. FBlockStream.WriteByte(Ord(AWord.IsTitle));
  482. WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
  483. FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
  484. FBlockStream.WriteWord(0);
  485. // write WLC to FWriteStream so we can write the size of the wlc entries
  486. WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
  487. WriteCompressedIntegerBE(FBlockStream, WLCSize);
  488. if FBlockStream.Position > FIFTI_NODE_SIZE then
  489. raise Exception.Create('FIFTIMAIN Leaf node has written past the block!');
  490. end;
  491. function Min(AValue, BValue: Byte): Byte;
  492. begin
  493. if AValue < BValue then
  494. Result := AValue
  495. else Result := BValue;
  496. end;
  497. function Max(AValue, BValue: Byte): Byte;
  498. begin
  499. if AValue > BValue then
  500. Result := AValue
  501. else Result := BValue;
  502. end;
  503. function Max(AValue, BValue: Integer): Integer;
  504. begin
  505. if AValue > BValue then
  506. Result := AValue
  507. else Result := BValue;
  508. end;
  509. function Max(AValue, BValue: DWord): DWord;
  510. begin
  511. if AValue > BValue then
  512. Result := AValue
  513. else Result := BValue;
  514. end;
  515. function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
  516. var
  517. LastDocIndex: DWord;
  518. LastLocCode: DWord;
  519. UsedBits: Byte;
  520. Buf: Byte;
  521. function NewDocDelta(ADocIndex: DWord): DWord;
  522. begin
  523. Result := ADocIndex - LastDocIndex;
  524. LastDocIndex := ADocIndex;
  525. end;
  526. function NewLocCode(ALocCode: DWord): DWord;
  527. begin
  528. Result := ALocCode - LastLocCode;
  529. LastLocCode := ALocCode;
  530. end;
  531. procedure AddValue(AValue: DWord; BitCount: Byte);
  532. var
  533. NeededBits: Byte;
  534. Tmp: Byte;
  535. begin
  536. AValue := AValue shl (32 - BitCount);
  537. while BitCount > 0 do
  538. begin
  539. NeededBits := 8 - UsedBits;
  540. Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
  541. Buf := Buf or Tmp;
  542. Inc(UsedBits, Min(BitCount, NeededBits));
  543. AValue := AValue shl Min(BitCount, NeededBits);
  544. Dec(BitCount, Min(BitCount, NeededBits));
  545. if (UsedBits = 8) then
  546. begin
  547. FWriteStream.WriteByte(Buf);
  548. UsedBits := 0;
  549. NeededBits := 0;
  550. Buf := 0;
  551. end;
  552. end;
  553. end;
  554. procedure FlushBuffer;
  555. begin
  556. if UsedBits > 0 then
  557. FWriteStream.WriteByte(Buf);
  558. UsedBits := 0;
  559. Buf := 0;
  560. end;
  561. var
  562. DocDelta: DWord;
  563. LocDelta: DWord;
  564. StartPos: DWord;
  565. Bits: DWord;
  566. BitCount: Byte;
  567. i,
  568. j: Integer;
  569. Doc: TIndexDocument;
  570. // proced
  571. begin
  572. StartPos := FWriteStream.Position;
  573. LastDocIndex := 0;
  574. UsedBits := 0;
  575. Buf := 0;
  576. for i := 0 to AWord.DocumentCount-1 do
  577. begin
  578. LastLocCode := 0;
  579. Doc := AWord.GetLogicalDocument(i);
  580. DocDelta := NewDocDelta(Doc.DocumentIndex);
  581. BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
  582. AddValue(Bits, BitCount);
  583. BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
  584. AddValue(Bits, BitCount);
  585. for j := 0 to Doc.NumberOfIndexEntries-1 do
  586. begin
  587. LocDelta := NewLocCode(Doc.IndexEntry[j]);
  588. BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
  589. AddValue(Bits, BitCount);
  590. end;
  591. FlushBuffer;
  592. end;
  593. Result := FWriteStream.Position-StartPos;
  594. end;
  595. { TIndexNode }
  596. function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
  597. var
  598. Offset: Byte;
  599. begin
  600. Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
  601. end;
  602. procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
  603. var
  604. Offset: Byte;
  605. NewWord: String;
  606. begin
  607. if FBlockStream.Position = 0 then
  608. FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
  609. if GuessIfCanHold(AWord) = False then
  610. Flush(True);
  611. NewWord := AdjustedWord(AWord, Offset, FLastWord);
  612. FLastWord:=AWord;
  613. // Write the Index node Entry
  614. FBlockStream.WriteByte(Length(NewWord)+1);
  615. FBlockStream.WriteByte(Offset);
  616. FBlockStream.Write(NewWord[1], Length(NewWord));
  617. FBlockStream.WriteDWord(NtoLE(ANodeOffset));
  618. FBlockStream.WriteWord(0);
  619. if FBlockStream.Position > FIFTI_NODE_SIZE then
  620. raise Exception.Create('FIFTIMAIN Index node has written past the block!');
  621. end;
  622. procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
  623. var
  624. RemSize: DWord;
  625. begin
  626. if NewBlockNeeded then
  627. begin
  628. if ParentNode = nil then
  629. begin
  630. ParentNode := TIndexNode.Create(FWriteStream);
  631. OwnsParentNode:=True;
  632. end;
  633. end;
  634. if ParentNode <> nil then
  635. ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);
  636. RemSize := RemainingSpace;
  637. FillRemainingSpace;
  638. FBlockStream.Position := 0;
  639. FBlockStream.WriteWord(NtoLE(RemSize));
  640. FBlockStream.Position := 0;
  641. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  642. FBlockStream.Position := 0;
  643. FLastWord := '';
  644. if NewBlockNeeded then
  645. FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
  646. else
  647. if ParentNode <> nil then
  648. ParentNode.Flush(NewBlockNeeded);
  649. end;
  650. { TChmSearchReader }
  651. procedure TChmSearchReader.ReadCommonData;
  652. var
  653. Sig: DWord;
  654. begin
  655. FStream.Position := 0;
  656. Sig := LEtoN(FStream.ReadDWord);
  657. FFileIsValid := Sig = $00280000;
  658. if not FileIsValid then
  659. Exit;
  660. // root node address
  661. FStream.Position := $8;
  662. FRootNodeOffset := LEtoN(FStream.ReadDWord);
  663. // Tree Depth
  664. FStream.Position := $18;
  665. FTreeDepth := LEtoN(FStream.ReadWord);
  666. // Root sizes for scale and root integers
  667. FStream.Position := $1E;
  668. if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
  669. FFileIsValid := False;
  670. FDocRootSize := FStream.ReadByte;
  671. if FStream.ReadByte <> 2 then
  672. FFileIsValid := False;
  673. FCodeCountRootSize := FStream.ReadByte;
  674. if FStream.ReadByte <> 2 then
  675. FFileIsValid := False;
  676. FLocCodeRootSize := FStream.ReadByte;
  677. end;
  678. procedure TChmSearchReader.MoveToFirstLeafNode;
  679. var
  680. NodeDepth: Integer;
  681. NodeOffset: DWord;
  682. LastWord: String;
  683. NewWord: String;
  684. begin
  685. NodeDepth := FTreeDepth;
  686. MoveToRootNode;
  687. while NodeDepth > 1 do
  688. begin
  689. LastWord := '';
  690. ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
  691. Dec(NodeDepth);
  692. MoveToNode(NodeOffset, NodeDepth);
  693. end;
  694. end;
  695. procedure TChmSearchReader.MoveToRootNode;
  696. begin
  697. MoveToNode(FRootNodeOffset, FTreeDepth);
  698. end;
  699. procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  700. begin
  701. FStream.Position := ANodeOffset;
  702. FActiveNodeStart := FStream.Position;
  703. if ANodeDepth > 1 then
  704. begin
  705. FnextLeafNode := 0;
  706. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
  707. end
  708. else
  709. begin
  710. FnextLeafNode := LEtoN(FStream.ReadDWord);
  711. FStream.ReadWord;
  712. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
  713. end;
  714. end;
  715. function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
  716. var
  717. WordLength: Integer;
  718. CopyLastWordCharCount: Integer;
  719. begin
  720. WordLength := FStream.ReadByte;
  721. CopyLastWordCharCount := FStream.ReadByte;
  722. if CopyLastWordCharCount > 0 then
  723. Result := Copy(ALastWord, 1, CopyLastWordCharCount);
  724. SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
  725. if WordLength > 1 then
  726. FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
  727. end;
  728. function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
  729. ASubNodeStart: DWord ): Boolean;
  730. begin
  731. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  732. if not Result then
  733. Exit;
  734. AWord := ReadWordOrPartialWord(ALastWord);
  735. ASubNodeStart := LEtoN(FStream.ReadDWord);
  736. FStream.ReadWord;
  737. end;
  738. function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
  739. AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
  740. AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
  741. begin
  742. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  743. if not Result then
  744. Exit;
  745. AWord := ReadWordOrPartialWord(ALastWord);
  746. AInTitle := FStream.ReadByte = 1;
  747. AWLCCount := GetCompressedIntegerBE(FStream);
  748. AWLCOffset := LEtoN(FStream.ReadDWord);
  749. FStream.ReadWord;
  750. AWLCSize := GetCompressedIntegerBE(FStream);
  751. end;
  752. function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;
  753. var
  754. Buf: Byte;
  755. BitsInBuffer: Integer;
  756. FinalPosition: int64;
  757. function GetNextByte(): Boolean;
  758. begin
  759. Result := (FStream.Position < FinalPosition);
  760. if Result then
  761. begin
  762. Buf := FStream.ReadByte;
  763. Inc(BitsInBuffer, 8);
  764. end;
  765. end;
  766. function ShiftBuffer: Boolean;
  767. begin
  768. Buf := (Buf and $7F) shl 1;
  769. Dec(BitsInBuffer);
  770. Result := (BitsInBuffer > 0) or GetNextByte();
  771. end;
  772. function ReadWLC(RootSize: DWord): DWord;
  773. var
  774. PrefixBits: Integer = 0;
  775. RemainingBits: Integer; // only the bits for this number not the bits in buffer
  776. begin
  777. if (BitsInBuffer = 0) then
  778. GetNextByte();
  779. Result := (Buf and $80) shr 7;
  780. while (Buf and $80) > 0 do // find out how many prefix bits there are
  781. begin
  782. Inc(PrefixBits);
  783. if not ShiftBuffer then
  784. Exit;
  785. end;
  786. // skip divider (zero) bit
  787. if not ShiftBuffer then
  788. Exit;
  789. Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
  790. while RemainingBits > 0 do
  791. begin
  792. Result := Result shl 1;
  793. Result := Result or (Buf shr 7);
  794. Dec(RemainingBits);
  795. if not ShiftBuffer then
  796. Exit;
  797. end;
  798. end;
  799. procedure ClearBuffer;
  800. begin
  801. if BitsInBuffer < 8 then
  802. begin
  803. BitsInBuffer := 0;
  804. Buf := 0;
  805. end;
  806. end;
  807. var
  808. TopicHits: DWord;
  809. i: Integer;
  810. j: Integer;
  811. CachedStreamPos: QWord;
  812. LastDoc,
  813. LastLocCode: DWord;
  814. begin
  815. FinalPosition := AWLCOffset + AWLCSize;
  816. CachedStreamPos := FStream.Position;
  817. FStream.Position := AWLCOffset;
  818. {for i := 0 to AWLCSize-1 do
  819. begin
  820. Buf := FStream.ReadByte;
  821. Write(binStr(Buf, 8), ' ');
  822. end;}
  823. FStream.Position := AWLCOffset;
  824. SetLength(Result, AWLCCount);
  825. Buf := 0;
  826. BitsInBuffer := 0;
  827. LastDoc := 0;
  828. for i := 0 to AWLCCount-1 do
  829. begin
  830. Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;
  831. LastDoc := Result[i].TopicIndex;
  832. TopicHits := ReadWLC(FCodeCountRootSize);
  833. SetLength(Result[i].LocationCodes, TopicHits);
  834. LastLocCode := 0;
  835. for j := 0 to TopicHits-1 do
  836. begin
  837. Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
  838. LastLocCode := Result[i].LocationCodes[j];
  839. end;
  840. ClearBuffer;
  841. end;
  842. FStream.Position := CachedStreamPos;
  843. end;
  844. constructor TChmSearchReader.Create ( AStream: TStream;
  845. AFreeStreamOnDestroy: Boolean ) ;
  846. begin
  847. FStream := AStream;
  848. FFreeStreamOnDestroy := AFreeStreamOnDestroy;
  849. ReadCommonData;
  850. end;
  851. destructor TChmSearchReader.Destroy;
  852. begin
  853. if FFreeStreamOnDestroy then
  854. FreeAndNil(FStream);
  855. inherited Destroy;
  856. end;
  857. procedure TChmSearchReader.DumpData (
  858. AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
  859. var
  860. LastWord: String;
  861. TheWord: String;
  862. InTitle: Boolean;
  863. WLCCount: DWord;
  864. WLCOffset: DWord;
  865. WLCSize: DWord;
  866. FoundHits: TChmWLCTopicArray;
  867. i: Integer;
  868. j: Integer;
  869. begin
  870. MoveToFirstLeafNode;
  871. LastWord := '';
  872. repeat
  873. if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
  874. begin
  875. if FnextLeafNode <> 0 then
  876. begin
  877. MoveToNode(FnextLeafNode, 1);
  878. LastWord := '';
  879. end
  880. else
  881. Break;
  882. end
  883. else begin
  884. LastWord := TheWord;
  885. //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
  886. FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  887. //WriteLn('DONE Reading Hits for ', TheWord);
  888. // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
  889. for i := 0 to High(FoundHits) do
  890. for j := 0 to High(FoundHits[i].LocationCodes) do
  891. AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
  892. end;
  893. until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
  894. end;
  895. function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray; AStartsWith: Boolean = True): TChmWLCTopicArray;
  896. var
  897. LastWord: String;
  898. NewWord: String;
  899. NodeLevel: Integer;
  900. NewNodePosition: DWord;
  901. InTitle: Boolean;
  902. WLCCount: DWord;
  903. WLCOffset: DWord;
  904. WLCSize: DWord;
  905. CompareResult: Integer;
  906. ReadNextResult: Boolean;
  907. begin
  908. AWord := LowerCase(AWord);
  909. NodeLevel := FTreeDepth;
  910. MoveToRootNode;
  911. SetLength(Result, 0);
  912. LastWord := '';
  913. // descend the index node tree until we find the leafnode
  914. while NodeLevel > 1 do begin
  915. //WriteLn('At Node Level ', NodeLevel);
  916. if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
  917. begin
  918. LastWord := NewWord;
  919. //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
  920. if ChmCompareText(NewWord, AWord) >= 0 then
  921. begin
  922. LastWord := '';
  923. Dec(NodeLevel);
  924. MoveToNode(NewNodePosition, NodeLevel);
  925. end;
  926. end
  927. else
  928. Break;
  929. end;
  930. if NodeLevel > 1 then
  931. Exit; // the entry we are looking for is > than the last entry of the last index node
  932. // now we are in a leafnode
  933. while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
  934. begin
  935. //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
  936. LastWord := NewWord;
  937. if Length(NewWord) < Length(AWord) then
  938. continue;
  939. if AStartsWith then //it only has to start with the searched term
  940. CompareResult := ChmCompareText(AWord, Copy(NewWord, 1, Length(AWord)))
  941. else // it must match exactly
  942. CompareResult := ChmCompareText(AWord, NewWord);
  943. if CompareResult < 0 then
  944. Exit;
  945. if CompareResult = 0 then
  946. begin
  947. if InTitle then
  948. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  949. else
  950. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  951. // check if the next entry is the same word since there is an entry for titles and for body
  952. if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
  953. ReadNextResult := True
  954. else if (FNextLeafNode <> 0) then
  955. begin
  956. MoveToNode(FNextLeafNode, 1);
  957. LastWord := '';
  958. ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
  959. end;
  960. if ReadNextResult and (NewWord = AWord) then
  961. begin
  962. if InTitle then
  963. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  964. else
  965. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  966. end;
  967. Exit;
  968. end;
  969. end;
  970. end;
  971. end.