chmfiftimain.pas 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067
  1. { Copyright (C) <2008> <Andrew Haines> chmfiftimain.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. unit chmFiftiMain;
  19. {$mode objfpc}{$H+}
  20. interface
  21. uses Classes, HTMLIndexer;
  22. type
  23. TFiftiMainHeader = record
  24. Sig: array [0..3] of byte; //$00,$00,$28,$00
  25. HTMLFilesCount: DWord;
  26. RootNodeOffset: DWord;
  27. Unknown1: DWord; // = 0
  28. LeafNodeCount: DWord;
  29. CopyOfRootNodeOffset: DWord;
  30. TreeDepth: Word;
  31. Unknown2: DWord; // = 7
  32. DocIndexScale: Byte;
  33. DocIndexRootSize: Byte;
  34. CodeCountScale: Byte;
  35. CodeCountRootSize: Byte;
  36. LocationCodeScale: Byte;
  37. LocationCodeRootSize: Byte;
  38. Unknown3: array[0..9] of byte; // = 0
  39. NodeSize: DWord; // 4096;
  40. Unknown4: DWord; // 0 or 1;
  41. LastDupWordIndex: DWord;
  42. LastDupCharIndex: DWord;
  43. LongestWordLength: DWord; // maximum 99
  44. TotalWordsIndexed: DWord; // includes duplicates
  45. TotalWords: DWord; // word count not including duplicates
  46. TotalWordsLengthPart1: DWord; // length of all the words with duplicates plus the next dword!
  47. TotalWordsLengthPart2: DWord;
  48. TotalWordsLength: DWord; // length of all words not including duplicates
  49. WordBlockUnusedBytes: DWord; // who knows, this makes no sense when there are more than one blocks
  50. Unknown5: DWord; // 0
  51. HTMLFilesCountMinusOne: DWord; // maybe
  52. Unknown6: array[0..23] of Byte; // 0
  53. WindowsCodePage: DWord; // usually 1252
  54. LocalID: DWord;
  55. //Unknown7: array [0..893] of Byte; // 0
  56. end;
  57. { TFIftiNode }
  58. TFIftiNode = class(TObject)
  59. FLastWord: String;
  60. FWriteStream: TStream;
  61. FBlockStream: TMemoryStream;
  62. ParentNode: TFIftiNode;
  63. OwnsParentNode : boolean;
  64. function AdjustedWord(AWord: String; out AOffset: Byte; AOldWord: String): String;
  65. procedure ChildIsFull(AWord: String; ANodeOffset: DWord); virtual; abstract;
  66. function GuessIfCanHold(AWord: String): Boolean; virtual; abstract;
  67. procedure Flush(NewBlockNeeded: Boolean); virtual; abstract;
  68. procedure FillRemainingSpace;
  69. function RemainingSpace: DWord;
  70. constructor Create(AStream: TStream);
  71. destructor Destroy; override;
  72. end;
  73. { TChmSearchWriter }
  74. TChmSearchWriter = class(TObject)
  75. private
  76. FHeaderRec: TFiftiMainHeader;
  77. FStream: TStream;
  78. FWordList: TIndexedWordList;
  79. FActiveLeafNode: TFIftiNode;
  80. procedure ProcessWords;
  81. procedure WriteHeader(IsPlaceHolder: Boolean);
  82. procedure WriteAWord(AWord: TIndexedWord);
  83. public
  84. procedure WriteToStream;
  85. constructor Create(AStream: TStream; AWordList: TIndexedWordList);
  86. destructor Destroy; override;
  87. end;
  88. { TChmSearchReader }
  89. TChmWLCTopic = record
  90. TopicIndex: DWord;
  91. LocationCodes: array of DWord;
  92. end;
  93. TChmWLCTopicArray = array of TChmWLCTopic;
  94. TChmSearchReader = class;
  95. TChmSearchReaderFoundDataEvent = procedure(Sender: TChmSearchReader; AWord: String; ATopic: DWord; AWordIndex: DWord) of object;
  96. TChmSearchReader = class(TObject)
  97. private
  98. FStream: TStream;
  99. FFileIsValid: Boolean;
  100. FFreeStreamOnDestroy: Boolean;
  101. FDocRootSize,
  102. FCodeCountRootSize,
  103. FLocCodeRootSize: Integer;
  104. FTreeDepth: Integer;
  105. FRootNodeOffset: DWord;
  106. FActiveNodeStart: DWord;
  107. FActiveNodeFreeSpace: Word;
  108. FNextLeafNode: DWord;
  109. procedure ReadCommonData;
  110. procedure MoveToFirstLeafNode;
  111. procedure MoveToRootNode;
  112. procedure MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  113. function ReadWordOrPartialWord(ALastWord: String): String; // returns the whole word using the last word as a base
  114. function ReadIndexNodeEntry(ALastWord: String; out AWord: String; out ASubNodeStart: DWord): Boolean;
  115. function ReadLeafNodeEntry(ALastWord: String; out AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out AWLCOffset: DWord; out AWLCSize: DWord): Boolean;
  116. function ReadWLCEntries(AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord): TChmWLCTopicArray;
  117. public
  118. constructor Create(AStream: TStream; AFreeStreamOnDestroy: Boolean);
  119. destructor Destroy; override;
  120. procedure DumpData(AFoundDataEvent: TChmSearchReaderFoundDataEvent);
  121. function LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray): TChmWLCTopicArray;
  122. property FileIsValid: Boolean read FFileIsValid;
  123. end;
  124. const
  125. FIFTI_NODE_SIZE = 4096;
  126. implementation
  127. uses SysUtils, Math, ChmBase;
  128. type
  129. { TIndexNode }
  130. TIndexNode = class(TFIftiNode)
  131. function GuessIfCanHold(AWord: String): Boolean; override;
  132. procedure ChildIsFull ( AWord: String; ANodeOffset: DWord ); override;
  133. procedure Flush(NewBlockNeeded: Boolean); override;
  134. end;
  135. { TLeafNode }
  136. TLeafNode = class(TFIftiNode)
  137. FLeafNodeCount: DWord;
  138. FLastNodeStart: DWord;
  139. FreeSpace: DWord;
  140. FDocRootSize,
  141. FCodeRootSize,
  142. FLocRootSize: Byte;
  143. procedure WriteInitialHeader;
  144. Destructor Destroy; override;
  145. function GuessIfCanHold(AWord: String): Boolean; override;
  146. procedure Flush(NewBlockNeeded: Boolean); override;
  147. procedure AddWord(AWord: TIndexedWord);
  148. function WriteWLCEntries(AWord: TIndexedWord; ADocRootSize, ACodeRootSize, ALocRootSize: Byte): DWord;
  149. property LeafNodeCount: DWord read FLeafNodeCount;
  150. property DocRootSize: Byte read FDocRootSize write FDocRootSize;
  151. property CodeRootSize: Byte read FCodeRootSize write FCodeRootSize;
  152. property LocRootSize: Byte read FLocRootSize write FLocRootSize;
  153. end;
  154. function GetCompressedIntegerBE(Stream: TStream): DWord;
  155. var
  156. Buf: Byte;
  157. Value: Dword = 0;
  158. Shift: Integer = 0;
  159. begin
  160. repeat
  161. Buf := Stream.ReadByte;
  162. Value := Value or (Buf and $7F) shl Shift;
  163. Inc(Shift, 7);
  164. until (Buf and $80) = 0;
  165. Result := Value;
  166. end;
  167. procedure WriteCompressedIntegerBE(Stream: TStream; AInt: DWord);
  168. var
  169. Bits: Integer;
  170. Tmp: DWord;
  171. Buf: Byte;
  172. begin
  173. Tmp := AInt;
  174. Bits := 0;
  175. while Tmp <> 0 do
  176. begin
  177. Tmp := Tmp shr 1;
  178. Inc(Bits);
  179. end;
  180. repeat
  181. Buf := (AInt shr (Tmp * 7)) and $7F;
  182. if Bits > 7 then
  183. Buf := Buf or $80;
  184. Dec(Bits, 7);
  185. Inc(Tmp);
  186. Stream.WriteByte(Buf);
  187. until Bits <= 0;
  188. end;
  189. function WriteScaleRootInt(ANumber: DWord; out Bits: DWord; Root: Integer): Byte;
  190. var
  191. Tmp: DWord;
  192. Mask: DWord;
  193. // Scale: Integer;
  194. NeededBits: Integer;
  195. PrefixBits: Integer;
  196. RootBits: Integer;
  197. begin
  198. // Scale := 2;
  199. Bits := 0;
  200. Result := Root;
  201. Tmp := ANumber;
  202. NeededBits := 0;
  203. while Tmp <> 0 do
  204. begin
  205. Inc(NeededBits);
  206. Tmp := Tmp shr 1;
  207. end;
  208. PrefixBits := Max(0, NeededBits-Root);
  209. RootBits := NeededBits -1; //
  210. if RootBits < Root then
  211. RootBits := Root;
  212. if RootBits < 0 then
  213. RootBits := 0;
  214. Mask := 0;
  215. if RootBits-1 >= 0 then
  216. for Tmp := 0 to RootBits-1 do
  217. Mask := Mask or (DWord(1) shl Tmp);
  218. Bits := not Mask;
  219. Bits := Bits shl 1; // make space for empty bit
  220. Bits := Bits or (ANumber and Mask);
  221. Result := PrefixBits + 1 + RootBits;
  222. Bits := (Bits shl (32-Result)) shr (32 - Result);
  223. end;
  224. { TChmSearchWriter }
  225. procedure TChmSearchWriter.ProcessWords;
  226. begin
  227. FWordList.ForEach(@WriteAword);
  228. if FActiveLeafNode <> nil then
  229. FActiveLeafNode.Flush(False); // causes the unwritten parts of the tree to be written
  230. end;
  231. procedure TChmSearchWriter.WriteHeader ( IsPlaceHolder: Boolean ) ;
  232. var
  233. TmpNode: TFIftiNode;
  234. i: Integer;
  235. begin
  236. if IsPlaceHolder then
  237. begin
  238. FStream.Size := $400; // the header size. we will fill this after the nodes have been determined
  239. FStream.Position := $400;
  240. FillChar(PChar(TMemoryStream(FStream).Memory)^, $400, 0);
  241. FHeaderRec.DocIndexRootSize := 1;
  242. FHeaderRec.CodeCountRootSize := 1;
  243. FHeaderRec.LocationCodeRootSize := 4;
  244. Exit;
  245. end;
  246. // write the glorious header
  247. FHeaderRec.Sig[2] := $28;
  248. FHeaderRec.HTMLFilesCount := FWordList.IndexedFileCount;
  249. FHeaderRec.RootNodeOffset := FStream.Size-4096;
  250. FHeaderRec.LeafNodeCount := TLeafNode(FActiveLeafNode).LeafNodeCount;
  251. FHeaderRec.CopyOfRootNodeOffset := FHeaderRec.RootNodeOffset;
  252. FHeaderRec.TreeDepth := 0;
  253. TmpNode := FActiveLeafNode;
  254. while TmpNode <> nil do
  255. begin
  256. Inc(FHeaderRec.TreeDepth);
  257. TmpNode := TmpNode.ParentNode;
  258. end;
  259. FHeaderRec.DocIndexScale := 2;
  260. FHeaderRec.CodeCountScale := 2;
  261. FHeaderRec.LocationCodeScale := 2;
  262. //FHeaderRec.DocIndexRootSize := 15;
  263. //FHeaderRec.CodeCountRootSize := 15;
  264. //FHeaderRec.LocationCodeRootSize := 15;
  265. FHeaderRec.NodeSize := 4096;
  266. FHeaderRec.LongestWordLength := FWordList.LongestWord;
  267. FHeaderRec.TotalWordsIndexed := FWordList.TotalWordCount;
  268. FHeaderRec.TotalWords := FWordList.TotalDIfferentWords;
  269. FHeaderRec.TotalWordsLengthPart1 := FWordList.TotalWordLength;
  270. FHeaderRec.TotalWordsLength := FWordList.TotalDifferentWordLength;
  271. FHeaderRec.WindowsCodePage := 1252;
  272. FStream.Position := 0;
  273. FStream.Write(FHeaderRec.Sig[0], 4);
  274. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount));
  275. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset));
  276. FStream.WriteDWord(NtoLE(0)); // unknown 1
  277. FStream.WriteDWord(NtoLE(FHeaderRec.LeafNodeCount));
  278. FStream.WriteDWord(NtoLE(FHeaderRec.RootNodeOffset)); // yes twice
  279. FStream.WriteWord(NtoLE(FHeaderRec.TreeDepth));
  280. FStream.WriteDWord(NtoLE(DWord(7)));
  281. FStream.WriteByte(2);
  282. FStream.WriteByte(FHeaderRec.DocIndexRootSize);
  283. FStream.WriteByte(2);
  284. FStream.WriteByte(FHeaderRec.CodeCountRootSize);
  285. FStream.WriteByte(2);
  286. FStream.WriteByte(FHeaderRec.LocationCodeRootSize);
  287. // eat 10 bytes
  288. FStream.WriteWord(0);
  289. FStream.WriteDWord(0);
  290. FStream.WriteDWord(0);
  291. FStream.WriteDWord(NtoLE(FHeaderRec.NodeSize));
  292. FStream.WriteDWord(NtoLE(DWord(0)));
  293. FStream.WriteDWord(1);
  294. FStream.WriteDWord(5);
  295. FStream.WriteDWord(NtoLE(FHeaderRec.LongestWordLength));
  296. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsIndexed));
  297. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWords));
  298. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart1));
  299. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLengthPart2));
  300. FStream.WriteDWord(NtoLE(FHeaderRec.TotalWordsLength));
  301. FStream.WriteDWord(NtoLE(TLeafNode(FActiveLeafNode).FreeSpace));
  302. FStream.WriteDWord(NtoLE(0));
  303. FStream.WriteDWord(NtoLE(FHeaderRec.HTMLFilesCount-1));
  304. for i := 0 to 23 do FStream.WriteByte(0);
  305. FStream.WriteDWord(NtoLE(FHeaderRec.WindowsCodePage));
  306. FStream.WriteDWord(NtoLE(DWord(1033))); // LCID
  307. for i := 0 to 893 do FStream.WriteByte(0);
  308. end;
  309. procedure TChmSearchWriter.WriteAWord ( AWord: TIndexedWord ) ;
  310. begin
  311. if FActiveLeafNode = nil then
  312. begin
  313. FActiveLeafNode := TLeafNode.Create(FStream);
  314. with TLeafNode(FActiveLeafNode) do
  315. begin
  316. DocRootSize := FHeaderRec.DocIndexRootSize;
  317. CodeRootSize := FHeaderRec.CodeCountRootSize;
  318. LocRootSize := FHeaderRec.LocationCodeRootSize;
  319. end;
  320. end;
  321. if FActiveLeafNode.GuessIfCanHold(AWord.TheWord) = False then
  322. begin
  323. FActiveLeafNode.Flush(True);
  324. end;
  325. TLeafNode(FActiveLeafNode).AddWord(AWord);
  326. end;
  327. procedure TChmSearchWriter.WriteToStream;
  328. begin
  329. WriteHeader(True);
  330. ProcessWords;
  331. WriteHeader(False);
  332. end;
  333. constructor TChmSearchWriter.Create ( AStream: TStream;
  334. AWordList: TIndexedWordList ) ;
  335. begin
  336. FStream := AStream;
  337. FWordList := AWordList;
  338. FActiveLeafNode:=NIL;
  339. end;
  340. destructor TChmSearchWriter.Destroy;
  341. begin
  342. freeandnil(FActiveLeafNode);
  343. end;
  344. { TLeafNode }
  345. function TFIftiNode.RemainingSpace: DWord;
  346. begin
  347. Result := FIFTI_NODE_SIZE - FBlockStream.Position;
  348. end;
  349. constructor TFIftiNode.Create ( AStream: TStream ) ;
  350. begin
  351. inherited Create;
  352. FWriteStream := AStream;
  353. FBlockStream := TMemoryStream.Create;
  354. OwnsParentNode :=false;
  355. end;
  356. destructor TFIftiNode.Destroy;
  357. begin
  358. FBlockStream.Free;
  359. if OwnsParentNode then ParentNode.Free;
  360. inherited Destroy;
  361. end;
  362. procedure TFIftiNode.FillRemainingSpace;
  363. begin
  364. while RemainingSpace > 0 do
  365. FBlockStream.WriteByte(0);
  366. end;
  367. function TFIftiNode.AdjustedWord ( AWord: String; out AOffset: Byte; AOldWord: String ) : String;
  368. var
  369. Count1,
  370. Count2: Integer;
  371. Count: Integer;
  372. i: Integer;
  373. begin
  374. if AWord = AOldWord then
  375. begin
  376. AOffset := Length(AWord);
  377. Exit('');
  378. end;
  379. // else
  380. Count1 := Length(AOldWord);
  381. Count2 := Length(AWord);
  382. if Count1<Count2 then
  383. Count := Count1
  384. else
  385. Count := Count2;
  386. for i := 1 to Count do
  387. begin
  388. AOffset := i-1;
  389. if AOldWord[i] <> AWord[i]
  390. then Exit(Copy(AWord, i, Length(AWord)));
  391. end;
  392. Result := AWord;
  393. AOffset := 0;
  394. end;
  395. procedure TLeafNode.WriteInitialHeader;
  396. begin
  397. FBlockStream.WriteDWord(0);
  398. FBlockStream.WriteWord(0);
  399. FBlockStream.WriteWord(0);
  400. end;
  401. destructor TLeafNode.Destroy;
  402. begin
  403. inherited Destroy;
  404. end;
  405. function TLeafNode.GuessIfCanHold ( AWord: String ) : Boolean;
  406. var
  407. WordOffset: Byte;
  408. begin
  409. Result := 17 + Length(AdjustedWord(AWord, WordOffset, FLastWord)) < RemainingSpace;
  410. end;
  411. procedure TLeafNode.Flush(NewBlockNeeded: Boolean);
  412. var
  413. FTmpPos: DWord;
  414. begin
  415. Inc(FLeafNodeCount);
  416. FTmpPos := FWriteStream.Position;
  417. // update the previous leaf node about our position.
  418. if FLastNodeStart > 0 then
  419. begin
  420. FWriteStream.Position := FLastNodeStart;
  421. FWriteStream.WriteDWord(NtoLE(FTmpPos));
  422. FWriteStream.Position := FTmpPos;
  423. end;
  424. FLastNodeStart := FTmpPos;
  425. FreeSpace := RemainingSpace;
  426. FillRemainingSpace;
  427. // update the leaf header to show the available space.
  428. FBlockStream.Position := 6;
  429. FBlockStream.WriteWord(NtoLE(Word(FreeSpace)));
  430. // copy the leaf block to the fiftimain file
  431. FBlockStream.Position := 0;
  432. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  433. FBlockStream.Position := 0;
  434. if NewBlockNeeded or ((NewBlockNeeded = False) and (ParentNode <> nil)) then
  435. begin
  436. if ParentNode = nil then
  437. begin
  438. ParentNode := TIndexNode.Create(FWriteStream);
  439. OwnsParentNode:=True;
  440. end;
  441. ParentNode.ChildIsFull(FLastWord, FLastNodeStart);
  442. if (NewBlockNeeded = False) then
  443. ParentNode.Flush(False);
  444. end;
  445. FLastWord := '';
  446. end;
  447. procedure TLeafNode.AddWord ( AWord: TIndexedWord ) ;
  448. var
  449. Offset: Byte;
  450. NewWord: String;
  451. WLCSize: DWord;
  452. begin
  453. if Length(AWord.TheWord) > 99 then
  454. Exit; // Maximum word length is 99
  455. if FBlockStream.Position = 0 then
  456. WriteInitialHeader;
  457. NewWord := AdjustedWord(AWord.TheWord, Offset, FLastWord);
  458. FLastWord := AWord.TheWord;
  459. FBlockStream.WriteByte(Length(NewWord)+1);
  460. FBlockStream.WriteByte(Offset);
  461. FBlockStream.Write(NewWord[1], Length(Trim(NewWord)));
  462. FBlockStream.WriteByte(Ord(AWord.IsTitle));
  463. WriteCompressedIntegerBE(FBlockStream, AWord.DocumentCount);
  464. FBlockStream.WriteDWord(NtoLE(DWord(FWriteStream.Position)));
  465. FBlockStream.WriteWord(0);
  466. // write WLC to FWriteStream so we can write the size of the wlc entries
  467. WLCSize := WriteWLCEntries(AWord, FDocRootSize, FCodeRootSize, FLocRootSize);
  468. WriteCompressedIntegerBE(FBlockStream, WLCSize);
  469. end;
  470. function Min(AValue, BValue: Byte): Byte;
  471. begin
  472. if AValue < BValue then
  473. Result := AValue
  474. else Result := BValue;
  475. end;
  476. function Max(AValue, BValue: Byte): Byte;
  477. begin
  478. if AValue > BValue then
  479. Result := AValue
  480. else Result := BValue;
  481. end;
  482. function Max(AValue, BValue: Integer): Integer;
  483. begin
  484. if AValue > BValue then
  485. Result := AValue
  486. else Result := BValue;
  487. end;
  488. function Max(AValue, BValue: DWord): DWord;
  489. begin
  490. if AValue > BValue then
  491. Result := AValue
  492. else Result := BValue;
  493. end;
  494. function TLeafNode.WriteWLCEntries ( AWord: TIndexedWord ; ADocRootSize, ACodeRootSize, ALocRootSize: Byte) : DWord;
  495. var
  496. LastDocIndex: DWord;
  497. LastLocCode: DWord;
  498. UsedBits: Byte;
  499. Buf: Byte;
  500. function NewDocDelta(ADocIndex: DWord): DWord;
  501. begin
  502. Result := ADocIndex - LastDocIndex;
  503. LastDocIndex := ADocIndex;
  504. end;
  505. function NewLocCode(ALocCode: DWord): DWord;
  506. begin
  507. Result := ALocCode - LastLocCode;
  508. LastLocCode := ALocCode;
  509. end;
  510. procedure AddValue(AValue: DWord; BitCount: Byte);
  511. var
  512. NeededBits: Byte;
  513. Tmp: Byte;
  514. begin
  515. AValue := AValue shl (32 - BitCount);
  516. while BitCount > 0 do
  517. begin
  518. NeededBits := 8 - UsedBits;
  519. Tmp := Hi(Hi(DWord(AValue shr (UsedBits))));
  520. Buf := Buf or Tmp;
  521. Inc(UsedBits, Min(BitCount, NeededBits));
  522. AValue := AValue shl Min(BitCount, NeededBits);
  523. Dec(BitCount, Min(BitCount, NeededBits));
  524. if (UsedBits = 8) then
  525. begin
  526. FWriteStream.WriteByte(Buf);
  527. UsedBits := 0;
  528. NeededBits := 0;
  529. Buf := 0;
  530. end;
  531. end;
  532. end;
  533. procedure FlushBuffer;
  534. begin
  535. if UsedBits > 0 then
  536. FWriteStream.WriteByte(Buf);
  537. UsedBits := 0;
  538. Buf := 0;
  539. end;
  540. var
  541. DocDelta: DWord;
  542. LocDelta: DWord;
  543. StartPos: DWord;
  544. Bits: DWord;
  545. BitCount: Byte;
  546. i,
  547. j: Integer;
  548. Doc: TIndexDocument;
  549. // proced
  550. begin
  551. StartPos := FWriteStream.Position;
  552. LastDocIndex := 0;
  553. UsedBits := 0;
  554. Buf := 0;
  555. for i := 0 to AWord.DocumentCount-1 do
  556. begin
  557. LastLocCode := 0;
  558. Doc := AWord.GetLogicalDocument(i);
  559. DocDelta := NewDocDelta(Doc.DocumentIndex);
  560. BitCount := WriteScaleRootInt(DocDelta, Bits, ADocRootSize);
  561. AddValue(Bits, BitCount);
  562. BitCount := WriteScaleRootInt(Doc.NumberOfIndexEntries, Bits, ACodeRootSize);
  563. AddValue(Bits, BitCount);
  564. for j := 0 to Doc.NumberOfIndexEntries-1 do
  565. begin
  566. LocDelta := NewLocCode(Doc.IndexEntry[j]);
  567. BitCount := WriteScaleRootInt(LocDelta, Bits, ALocRootSize);
  568. AddValue(Bits, BitCount);
  569. end;
  570. FlushBuffer;
  571. end;
  572. Result := FWriteStream.Position-StartPos;
  573. end;
  574. { TIndexNode }
  575. function TIndexNode.GuessIfCanHold ( AWord: String ) : Boolean;
  576. var
  577. Offset: Byte;
  578. begin
  579. Result := FBlockStream.Position + 8 + Length(AdjustedWord(AWord, Offset, FLastWord)) < FIFTI_NODE_SIZE;
  580. end;
  581. procedure TIndexNode.ChildIsFull ( AWord: String; ANodeOffset: DWord ) ;
  582. var
  583. Offset: Byte;
  584. begin
  585. if FBlockStream.Position = 0 then
  586. FBlockStream.WriteWord(0); // free space at end. updated when the block is flushed
  587. if GuessIfCanHold(AWord) = False then
  588. Flush(True);
  589. AWord := AdjustedWord(AWord, Offset, FLastWord);
  590. // Write the Index node Entry
  591. FBlockStream.WriteByte(Length(AWord)+1);
  592. FBlockStream.WriteByte(Offset);
  593. FBlockStream.Write(AWord[1], Length(AWord));
  594. FBlockStream.WriteDWord(NtoLE(ANodeOffset));
  595. FBlockStream.WriteWord(0);
  596. end;
  597. procedure TIndexNode.Flush ( NewBlockNeeded: Boolean ) ;
  598. var
  599. RemSize: DWord;
  600. begin
  601. if NewBlockNeeded then
  602. begin
  603. if ParentNode = nil then
  604. begin
  605. ParentNode := TIndexNode.Create(FWriteStream);
  606. OwnsParentNode:=True;
  607. end;
  608. end;
  609. if ParentNode <> nil then
  610. ParentNode.ChildIsFull(FLastWord, FWriteStream.Position);
  611. RemSize := RemainingSpace;
  612. FillRemainingSpace;
  613. FBlockStream.Position := 0;
  614. FBlockStream.WriteWord(NtoLE(RemSize));
  615. FBlockStream.Position := 0;
  616. FWriteStream.CopyFrom(FBlockStream, FIFTI_NODE_SIZE);
  617. FLastWord := '';
  618. if NewBlockNeeded then
  619. FBlockStream.WriteDWord(0) // placeholder to write free space in when block is full
  620. else
  621. if ParentNode <> nil then
  622. ParentNode.Flush(NewBlockNeeded);
  623. end;
  624. { TChmSearchReader }
  625. procedure TChmSearchReader.ReadCommonData;
  626. var
  627. Sig: DWord;
  628. begin
  629. FStream.Position := 0;
  630. Sig := LEtoN(FStream.ReadDWord);
  631. FFileIsValid := Sig = $00280000;
  632. if not FileIsValid then
  633. Exit;
  634. // root node address
  635. FStream.Position := $8;
  636. FRootNodeOffset := LEtoN(FStream.ReadDWord);
  637. // Tree Depth
  638. FStream.Position := $18;
  639. FTreeDepth := LEtoN(FStream.ReadWord);
  640. // Root sizes for scale and root integers
  641. FStream.Position := $1E;
  642. if FStream.ReadByte <> 2 then // we only can read the files when scale is 2
  643. FFileIsValid := False;
  644. FDocRootSize := FStream.ReadByte;
  645. if FStream.ReadByte <> 2 then
  646. FFileIsValid := False;
  647. FCodeCountRootSize := FStream.ReadByte;
  648. if FStream.ReadByte <> 2 then
  649. FFileIsValid := False;
  650. FLocCodeRootSize := FStream.ReadByte;
  651. end;
  652. procedure TChmSearchReader.MoveToFirstLeafNode;
  653. var
  654. NodeDepth: Integer;
  655. NodeOffset: DWord;
  656. LastWord: String;
  657. NewWord: String;
  658. begin
  659. NodeDepth := FTreeDepth;
  660. MoveToRootNode;
  661. while NodeDepth > 1 do
  662. begin
  663. LastWord := '';
  664. ReadIndexNodeEntry(LastWord, NewWord, NodeOffset);
  665. Dec(NodeDepth);
  666. MoveToNode(NodeOffset, NodeDepth);
  667. end;
  668. end;
  669. procedure TChmSearchReader.MoveToRootNode;
  670. begin
  671. MoveToNode(FRootNodeOffset, FTreeDepth);
  672. end;
  673. procedure TChmSearchReader.MoveToNode(ANodeOffset: DWord; ANodeDepth: Integer);
  674. begin
  675. FStream.Position := ANodeOffset;
  676. FActiveNodeStart := FStream.Position;
  677. if ANodeDepth > 1 then
  678. begin
  679. FnextLeafNode := 0;
  680. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord); // empty space at end of node
  681. end
  682. else
  683. begin
  684. FnextLeafNode := LEtoN(FStream.ReadDWord);
  685. FStream.ReadWord;
  686. FActiveNodeFreeSpace := LEtoN(FStream.ReadWord);
  687. end;
  688. end;
  689. function TChmSearchReader.ReadWordOrPartialWord ( ALastWord: String ) : String;
  690. var
  691. WordLength: Integer;
  692. CopyLastWordCharCount: Integer;
  693. begin
  694. WordLength := FStream.ReadByte;
  695. CopyLastWordCharCount := FStream.ReadByte;
  696. if CopyLastWordCharCount > 0 then
  697. Result := Copy(ALastWord, 1, CopyLastWordCharCount);
  698. SetLength(Result, (WordLength-1) + CopyLastWordCharCount);
  699. FStream.Read(Result[1+CopyLastWordCharCount], WordLength-1);
  700. end;
  701. function TChmSearchReader.ReadIndexNodeEntry (ALastWord: String; out AWord: String; out
  702. ASubNodeStart: DWord ): Boolean;
  703. begin
  704. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  705. if not Result then
  706. Exit;
  707. AWord := ReadWordOrPartialWord(ALastWord);
  708. ASubNodeStart := LEtoN(FStream.ReadDWord);
  709. FStream.ReadWord;
  710. end;
  711. function TChmSearchReader.ReadLeafNodeEntry ( ALastWord: String; out
  712. AWord: String; out AInTitle: Boolean; out AWLCCount: DWord; out
  713. AWLCOffset: DWord; out AWLCSize: DWord ): Boolean;
  714. begin
  715. Result := FStream.Position - FActiveNodeStart < FIFTI_NODE_SIZE - FActiveNodeFreeSpace;
  716. if not Result then
  717. Exit;
  718. AWord := ReadWordOrPartialWord(ALastWord);
  719. AInTitle := FStream.ReadByte = 1;
  720. AWLCCount := GetCompressedIntegerBE(FStream);
  721. AWLCOffset := LEtoN(FStream.ReadDWord);
  722. FStream.ReadWord;
  723. AWLCSize := GetCompressedIntegerBE(FStream);
  724. end;
  725. function TChmSearchReader.ReadWLCEntries (AWLCCount: DWord; AWLCOffset: DWord; AWLCSize: DWord ) : TChmWLCTopicArray;
  726. function AtEndOfWLCEntries: Boolean;
  727. begin
  728. Result := AWLCOffset + AWLCSize <= FStream.Position;
  729. end;
  730. var
  731. Buf: Byte;
  732. BitsInBuffer: Integer;
  733. procedure FillBuffer;
  734. begin
  735. while (BitsInBuffer = 0) and not AtEndOfWLCEntries do
  736. begin
  737. Buf := FStream.ReadByte;
  738. Inc(BitsInBuffer, 8);
  739. end;
  740. end;
  741. function ReadWLC(RootSize: DWord): DWord;
  742. var
  743. PrefixBits: Integer = 0;
  744. BitCount: Integer = 0;
  745. RemainingBits: Integer; // only the bits for this number not the bits in buffer
  746. begin
  747. FillBuffer;
  748. Result := 0;
  749. while (Buf and $80) > 0 do // find out how many prefix bits there are
  750. begin
  751. Inc(PrefixBits);
  752. Buf := Buf shl 1;
  753. Dec(BitsInBuffer);
  754. FillBuffer;
  755. end;
  756. if PrefixBits > 0 then
  757. Result := 1;
  758. Inc(BitCount, PrefixBits+1);
  759. Buf := Buf shl 1;
  760. Dec(BitsInBuffer);
  761. FillBuffer;
  762. Remainingbits := RootSize + Max(Integer(PrefixBits-1), 0);
  763. while RemainingBits > 0 do
  764. begin
  765. Result := Result shl 1;
  766. Result := Result or (Buf shr 7);
  767. Dec(RemainingBits);
  768. Buf := Buf shl 1;
  769. Dec(BitsInBuffer);
  770. FillBuffer;
  771. Inc(BitCount);
  772. end;
  773. end;
  774. procedure ClearBuffer;
  775. begin
  776. BitsInBuffer := 0;
  777. Buf := 0;
  778. end;
  779. var
  780. TopicHits: DWord;
  781. i: Integer;
  782. j: Integer;
  783. CachedStreamPos: QWord;
  784. LastDoc,
  785. LastLocCode: DWord;
  786. begin
  787. CachedStreamPos := FStream.Position;
  788. FStream.Position := AWLCOffset;
  789. {for i := 0 to AWLCSize-1 do
  790. begin
  791. Buf := FStream.ReadByte;
  792. Write(binStr(Buf, 8), ' ');
  793. end;}
  794. FStream.Position := AWLCOffset;
  795. SetLength(Result, AWLCCount);
  796. Buf := 0;
  797. BitsInBuffer := 0;
  798. LastDoc := 0;
  799. for i := 0 to AWLCCount-1 do
  800. begin
  801. Result[i].TopicIndex := ReadWLC(FDocRootSize) + LastDoc;
  802. LastDoc := Result[i].TopicIndex;
  803. TopicHits := ReadWLC(FCodeCountRootSize);
  804. SetLength(Result[i].LocationCodes, TopicHits);
  805. LastLocCode := 0;
  806. for j := 0 to TopicHits-1 do
  807. begin
  808. Result[i].LocationCodes[j] := ReadWLC(FLocCodeRootSize) + LastLocCode;
  809. LastLocCode := Result[i].LocationCodes[j];
  810. end;
  811. ClearBuffer;
  812. end;
  813. FStream.Position := CachedStreamPos;
  814. end;
  815. constructor TChmSearchReader.Create ( AStream: TStream;
  816. AFreeStreamOnDestroy: Boolean ) ;
  817. begin
  818. FStream := AStream;
  819. FFreeStreamOnDestroy := AFreeStreamOnDestroy;
  820. ReadCommonData;
  821. end;
  822. destructor TChmSearchReader.Destroy;
  823. begin
  824. if FFreeStreamOnDestroy then
  825. FreeAndNil(FStream);
  826. inherited Destroy;
  827. end;
  828. procedure TChmSearchReader.DumpData (
  829. AFoundDataEvent: TChmSearchReaderFoundDataEvent ) ;
  830. var
  831. LastWord: String;
  832. TheWord: String;
  833. InTitle: Boolean;
  834. WLCCount: DWord;
  835. WLCOffset: DWord;
  836. WLCSize: DWord;
  837. FoundHits: TChmWLCTopicArray;
  838. i: Integer;
  839. j: Integer;
  840. begin
  841. MoveToFirstLeafNode;
  842. LastWord := '';
  843. repeat
  844. if (ReadLeafNodeEntry(LastWord, TheWord, InTitle, WLCCount, WLCOffset, WLCSize) = False) then
  845. begin
  846. if FnextLeafNode <> 0 then
  847. begin
  848. MoveToNode(FnextLeafNode, 1);
  849. LastWord := '';
  850. end
  851. else
  852. Break;
  853. end
  854. else begin
  855. LastWord := TheWord;
  856. //WriteLn('Reading Hits for ', TheWord ,' at ', hexstr(WLCOffset,8) );
  857. FoundHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  858. //WriteLn('DONE Reading Hits for ', TheWord);
  859. // AFoundDataEvent(Self, TheWord, 0,0);//FoundHits[i].TopicIndex ,-1);//FoundHits[i].LocationCodes[j]);
  860. for i := 0 to High(FoundHits) do
  861. for j := 0 to High(FoundHits[i].LocationCodes) do
  862. AFoundDataEvent(Self, TheWord, FoundHits[i].TopicIndex ,FoundHits[i].LocationCodes[j]);
  863. end;
  864. until False; //FStream.Position - FActiveNodeStart >= FIFTI_NODE_SIZE - FActiveNodeFreeSpace
  865. end;
  866. function TChmSearchReader.LookupWord(AWord: String; out ATitleHits: TChmWLCTopicArray): TChmWLCTopicArray;
  867. var
  868. LastWord: String;
  869. NewWord: String;
  870. NodeLevel: Integer;
  871. NewNodePosition: DWord;
  872. InTitle: Boolean;
  873. WLCCount: DWord;
  874. WLCOffset: DWord;
  875. WLCSize: DWord;
  876. CompareResult: Integer;
  877. ReadNextResult: Boolean;
  878. begin
  879. AWord := LowerCase(AWord);
  880. NodeLevel := FTreeDepth;
  881. MoveToRootNode;
  882. SetLength(Result, 0);
  883. LastWord := '';
  884. // descend the index node tree until we find the leafnode
  885. while NodeLevel > 1 do begin
  886. //WriteLn('At Node Level ', NodeLevel);
  887. if ReadIndexNodeEntry(LastWord, NewWord, NewNodePosition) <> False then
  888. begin
  889. //WriteLn('Found Index Entry: ', NewWord, ' Comparing to ', AWord);
  890. if ChmCompareText(NewWord, AWord) >= 0 then
  891. begin
  892. LastWord := '';
  893. Dec(NodeLevel);
  894. MoveToNode(NewNodePosition, NodeLevel);
  895. end;
  896. end
  897. else
  898. Break;
  899. end;
  900. if NodeLevel > 1 then
  901. Exit; // the entry we are looking for is > than the last entry of the last index node
  902. // now we are in a leafnode
  903. while ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize) <> False do
  904. begin
  905. //WriteLn('Found Leaf Entry: ', NewWord, ' Comparing to ', AWord);
  906. LastWord := NewWord;
  907. CompareResult := ChmCompareText(AWord, NewWord);
  908. if CompareResult < 0 then
  909. Exit;
  910. if CompareResult = 0 then
  911. begin
  912. if InTitle then
  913. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  914. else
  915. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  916. // check if the next entry is the same word since there is an entry for titles and for body
  917. if (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize)) then
  918. ReadNextResult := True
  919. else if (FNextLeafNode <> 0) then
  920. begin
  921. MoveToNode(FNextLeafNode, 1);
  922. LastWord := '';
  923. ReadNextResult := (ReadLeafNodeEntry(LastWord, NewWord, InTitle, WLCCount, WLCOffset, WLCSize));
  924. end;
  925. if ReadNextResult and (NewWord = AWord) then
  926. begin
  927. if InTitle then
  928. ATitleHits := ReadWLCEntries(WLCCount, WLCOffset, WLCSize)
  929. else
  930. Result := ReadWLCEntries(WLCCount, WLCOffset, WLCSize);
  931. end;
  932. Exit;
  933. end;
  934. end;
  935. end;
  936. end.