htmlindexer.pas 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. { Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. unit HTMLIndexer;
  19. {$MODE OBJFPC}{$H+}
  20. interface
  21. uses Classes, SysUtils, FastHTMLParser,{$ifdef userb}fos_redblacktree_gen{$else}avl_tree{$endif};
  22. Type
  23. { TIndexDocument }
  24. TIndexDocument = class(TObject)
  25. private
  26. FDocumentIndex: Integer;
  27. FLastEntry : Integer;
  28. WordIndex: array of Integer;
  29. function getindexentries:integer;
  30. public
  31. function GetWordIndex(i:integer):integer; inline;
  32. procedure AddWordIndex(AIndex: Integer);
  33. constructor Create(ADocumentIndex: Integer);
  34. property DocumentIndex: Integer read FDocumentIndex;
  35. property IndexEntry[i:integer] : Integer read GetWordIndex;
  36. property NumberofIndexEntries : integer read getindexentries;
  37. end;
  38. { TIndexedWord }
  39. TIndexedWord = class(TObject)
  40. private
  41. FIsTitle: Boolean;
  42. FTheWord: string;
  43. FCachedTopic: TIndexDocument;
  44. FDocuments: Array of TIndexDocument;
  45. function GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
  46. function GetDocumentCount: Integer;
  47. public
  48. constructor Create(AWord: String; AIsTitle: Boolean);
  49. destructor Destroy; override;
  50. function GetLogicalDocument(AIndex: Integer): TIndexDocument;
  51. property TheWord: string read FTheWord write ftheword; // Always lowercase
  52. property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
  53. property DocumentCount: Integer read GetDocumentCount;
  54. property IsTitle: Boolean read FIsTitle write fistitle;
  55. end;
  56. { TIndexedWordList }
  57. {$ifdef userb}
  58. TRBIndexTree = specialize TGFOS_RBTree<String,TIndexedWord>;
  59. {$endif}
  60. TForEachMethod = procedure (AWord:TIndexedWord) of object;
  61. TForEachProcedure = Procedure (AWord:TIndexedWord;state:pointer);
  62. TIndexedWordList = class(TObject)
  63. private
  64. FIndexTitlesOnly: Boolean;
  65. FIndexedFileCount: DWord;
  66. //vars while processing page
  67. FInTitle,
  68. FInBody: Boolean;
  69. FWordCount: Integer; // only words in body
  70. FDocTitle: String;
  71. FTopicIndex: Integer;
  72. //end vars
  73. FTotalDifferentWordLength: DWord;
  74. FTotalDIfferentWords: DWord;
  75. FTotalWordCount: DWord;
  76. FTotalWordLength: DWord;
  77. FLongestWord: DWord;
  78. FParser: THTMLParser;
  79. {$ifdef userb}
  80. FAVLTree : TRBIndexTree;
  81. {$else}
  82. FAVLTree : TAVLTree;
  83. Spare :TIndexedWord;
  84. {$endif}
  85. function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
  86. // callbacks
  87. procedure CBFoundTag(NoCaseTag, ActualTag: string);
  88. procedure CBFountText(Text: string);
  89. procedure EatWords(Words: String; IsTitle: Boolean);
  90. public
  91. constructor Create;
  92. destructor Destroy; override;
  93. function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
  94. procedure Clear;
  95. procedure AddWord(const AWord: TIndexedWord);
  96. procedure ForEach(Proc:TForEachMethod);
  97. procedure ForEach(Proc:TForEachProcedure;state:pointer);
  98. property IndexedFileCount: DWord read FIndexedFileCount;
  99. property LongestWord: DWord read FLongestWord;
  100. property TotalWordCount: DWord read FTotalWordCount;
  101. property TotalDIfferentWords: DWord read FTotalDIfferentWords;
  102. property TotalWordLength: DWord read FTotalWordLength;
  103. property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
  104. property Words[AWord: String; IsTitle: Boolean] : TIndexedWord read AddGetWord;
  105. end;
  106. implementation
  107. Const GrowSpeed = 10;
  108. function Max(ANumber, BNumber: DWord): DWord;
  109. begin
  110. if ANumber > BNumber then
  111. Result := ANumber
  112. else
  113. Result := BNumber;
  114. end;
  115. const titlexlat : array [boolean] of char = ('0','1');
  116. function makekey( n : string;istitle:boolean):string; inline;
  117. begin
  118. result:=n+'___'+titlexlat[istitle];
  119. end;
  120. Function CompareProcObj(Node1, Node2: Pointer): integer;
  121. var n1,n2 : TIndexedWord;
  122. begin
  123. n1:=TIndexedWord(Node1); n2:=TIndexedWord(Node2);
  124. Result := CompareText(n1.theword, n2.theword);
  125. if Result = 0 then
  126. begin
  127. Result := ord(n2.IsTitle)-ord(n1.IsTitle);
  128. end;
  129. if Result < 0 then Result := -1
  130. else if Result > 0 then Result := 1;
  131. end;
  132. { TIndexedWordList }
  133. function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
  134. var
  135. {$ifdef userb}
  136. key : string;
  137. {$else}
  138. n : TAVLTreeNode;
  139. {$endif}
  140. begin
  141. Result := nil;
  142. AWord := LowerCase(AWord);
  143. {$ifdef userb}
  144. key:=makekey(aword,istitle);
  145. if not favltree.Find(key,result) then result:=nil;;
  146. {$else}
  147. if not assigned(spare) then
  148. spare:=TIndexedWord.Create(AWord,IsTitle)
  149. else
  150. begin
  151. spare.TheWord:=aword;
  152. spare.IsTitle:=IsTitle;
  153. end;
  154. n:=favltree.FindKey(Spare,@CompareProcObj);
  155. if assigned(n) then
  156. result:=TIndexedWord(n.Data);
  157. {$endif}
  158. if Result = nil then
  159. begin
  160. Inc(FTotalDifferentWordLength, Length(AWord));
  161. Inc(FTotalDIfferentWords);
  162. {$ifdef userb}
  163. result:=TIndexedWord.Create(AWord,IsTitle);
  164. favltree.add(key,result);
  165. {$else}
  166. Result := spare; // TIndexedWord.Create(AWord,IsTitle);
  167. spare:=nil;
  168. AddWord(Result);
  169. {$endif}
  170. // if IsTitle then
  171. //WriteLn('Creating word: ', AWord);
  172. FLongestWord := Max(FLongestWord, Length(AWord));
  173. end;
  174. Inc(FTotalWordLength, Length(AWord));
  175. Inc(FTotalWordCount);
  176. end;
  177. procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);
  178. begin
  179. if FInBody then begin
  180. if NoCaseTag = '</BODY>' then FInBody := False;
  181. end
  182. else begin
  183. //WriteLn('"',NoCaseTag,'"');
  184. if NoCaseTag = '<TITLE>' then FInTitle := True
  185. else if NoCaseTag = '</TITLE>' then FInTitle := False
  186. else if NoCaseTag = '<BODY>' then FInBody := True
  187. else
  188. end;
  189. if FInBody and FIndexTitlesOnly then FParser.Done := True;
  190. end;
  191. procedure TIndexedWordList.CBFountText(Text: string);
  192. begin
  193. if Length(Text) < 1 then
  194. Exit;
  195. if (not FInTitle) and (not FInBody) then
  196. Exit;
  197. EatWords(Text, FInTitle and not FInBody);
  198. end;
  199. procedure TIndexedWordList.EatWords ( Words: String; IsTitle: Boolean ) ;
  200. var
  201. WordPtr: PChar;
  202. WordStart: PChar;
  203. InWord: Boolean;
  204. IsNumberWord: Boolean;
  205. function IsEndOfWord: Boolean;
  206. begin
  207. Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
  208. if Result and IsNumberWord then
  209. Result := Result and (WordPtr[0] <> '.');
  210. if Result and InWord then
  211. Result := Result and (WordPtr[0] <> '''');
  212. ;
  213. end;
  214. var
  215. WordIndex: TIndexedWord;
  216. WordName: String;
  217. FPos: Integer;
  218. begin
  219. if IsTitle then
  220. FDocTitle := Words;
  221. Words := LowerCase(Words);
  222. WordStart := PChar(Words);
  223. WordPtr := WordStart;
  224. IsNumberWord := False;
  225. InWord := False;
  226. repeat
  227. if InWord and IsEndOfWord then
  228. begin
  229. WordName := Copy(WordStart, 0, (WordPtr-WordStart));
  230. FPos := Pos('''', WordName);
  231. while FPos > 0 do
  232. begin
  233. Delete(WordName, FPos, 1);
  234. FPos := Pos('''', WordName);
  235. end;
  236. WordIndex := addgetword(wordname,istitle);
  237. InWord := False;
  238. IsNumberWord := False;
  239. WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
  240. //if not IsTitle then
  241. Inc(FWordCount);
  242. end
  243. else if not InWord and not IsEndOfWord then
  244. begin
  245. InWord := True;
  246. WordStart := WordPtr;
  247. IsNumberWord := WordPtr^ in ['0'..'9'];
  248. end;
  249. Inc(WordPtr);
  250. until WordPtr^ = #0;
  251. if InWord then
  252. begin
  253. WordName := Copy(WordStart, 0, (WordPtr-WordStart));
  254. try
  255. WordIndex := addgetword(wordname,istitle); // Self.Words[WordName, IsTitle];
  256. except on e:exception do writeln('Error: ', wordname); end;
  257. WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
  258. InWord := False;
  259. //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
  260. IsNumberWord := False;
  261. //WriteLn(FWordCount, ' "', WordName,'"');
  262. if not IsTitle then
  263. Inc(FWordCount);
  264. end;
  265. end;
  266. function defaultindexedword : TIndexedWord;
  267. begin
  268. result:=Tindexedword.create('',false);
  269. end;
  270. constructor TIndexedWordList.Create;
  271. begin
  272. inherited;
  273. {$ifdef userb}
  274. FAVLTree :=TRBIndexTree.create(@default_rb_string_compare,
  275. @defaultindexedword,
  276. @default_rb_string_undef );
  277. {$else}
  278. favltree:=TAVLTree.Create(@CompareProcObj);
  279. spare:=nil;
  280. {$endif}
  281. end;
  282. procedure FreeObject(const Obj:TIndexedWord);
  283. begin
  284. obj.free;
  285. end;
  286. destructor TIndexedWordList.Destroy;
  287. begin
  288. clear;
  289. {$ifndef userb}
  290. if assigned(spare) then spare.free;
  291. {$endif}
  292. favltree.free;
  293. inherited Destroy;
  294. end;
  295. function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
  296. var
  297. TheFile: String;
  298. begin
  299. FInBody := False;
  300. FInTitle:= False;
  301. FIndexTitlesOnly := AIndexOnlyTitles;
  302. FWordCount := 0;
  303. FTopicIndex := ATOPICIndex;
  304. FIndexedFileCount := FIndexedFileCount +1;
  305. SetLength(TheFile, AStream.Size+1);
  306. AStream.Position := 0;
  307. AStream.Read(TheFile[1], AStream.Size);
  308. TheFile[Length(TheFile)] := #0;
  309. FParser := THTMLParser.Create(@TheFile[1]);
  310. FParser.OnFoundTag := @CBFoundTag;
  311. FParser.OnFoundText := @CBFountText;
  312. FParser.Exec;
  313. FParser.Free;
  314. Result := FDocTitle;
  315. FDocTitle := '';
  316. FInBody := False;
  317. FInTitle:= False;
  318. FWordCount := 0;
  319. FTopicIndex := -1;
  320. AStream.Position := 0;
  321. end;
  322. procedure TIndexedWordList.Clear;
  323. begin
  324. {$ifdef userb}
  325. fAvlTree.ClearN(@FreeObject);
  326. {$else}
  327. fAvlTree.FreeAndClear;
  328. {$endif}
  329. end;
  330. procedure TIndexedWordList.AddWord(const AWord: TIndexedWord);
  331. begin
  332. {$ifdef userb}
  333. favltree.add(makekey(aword.theword,aword.istitle),AWord);
  334. {$else}
  335. favltree.add(aword);
  336. {$endif}
  337. end;
  338. procedure TIndexedWordList.ForEach(Proc:TForEachMethod);
  339. {$ifdef userb}
  340. var key : string;
  341. val:TIndexedWord;
  342. {$else}
  343. var
  344. AVLNode : TAVLTreeNode;
  345. {$endif}
  346. begin
  347. {$ifdef userb}
  348. if favltree.FirstNode(key,val) then
  349. begin // Scan it forward
  350. repeat
  351. proc(val);
  352. until not favltree.FindNext(key,val);
  353. end;
  354. {$else}
  355. AVLNode:=fAVLTree.FindLowest;
  356. while (AVLNode<>nil) do
  357. begin
  358. Proc(TIndexedWord(AVLNode.Data));
  359. AVLNode:=FAVLTree.FindSuccessor(AVLNode)
  360. end;
  361. {$endif}
  362. end;
  363. procedure TIndexedWordList.ForEach(Proc:TForEachProcedure;state:pointer);
  364. {$ifdef userb}
  365. var key : string;
  366. val:TIndexedWord;
  367. {$else}
  368. var
  369. AVLNode : TAVLTreeNode;
  370. {$endif}
  371. begin
  372. {$ifdef userb}
  373. if favltree.FirstNode(key,val) then
  374. begin // Scan it forward
  375. repeat
  376. proc(val,state);
  377. until not favltree.FindNext(key,val);
  378. end;
  379. {$else}
  380. AVLNode:=fAVLTree.FindLowest;
  381. while (AVLNode<>nil) do
  382. begin
  383. Proc(TIndexedWord(AVLNode.Data),State);
  384. AVLNode:=FAVLTree.FindSuccessor(AVLNode)
  385. end;
  386. {$endif}
  387. end;
  388. { TIndexedWord }
  389. function TIndexedWord.GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
  390. var
  391. i: Integer;
  392. begin
  393. Result := nil;
  394. if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
  395. Exit(FCachedTopic);
  396. for i := 0 to High(FDocuments) do
  397. if FDocuments[i].FDocumentIndex = TopicIndexNum then
  398. Exit(FDocuments[i]);
  399. if Result = nil then
  400. begin
  401. Result := TIndexDocument.Create(TopicIndexNum);
  402. SetLength(FDocuments, Length(FDocuments)+1);
  403. FDocuments[High(FDocuments)] := Result;
  404. end;
  405. FCachedTopic := Result;
  406. end;
  407. function TIndexedWord.GetDocumentCount: Integer;
  408. begin
  409. Result := Length(FDocuments);
  410. end;
  411. constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
  412. begin
  413. FTheWord := AWord;
  414. FIsTitle := AIsTitle;
  415. end;
  416. destructor TIndexedWord.Destroy;
  417. var
  418. i: Integer;
  419. begin
  420. // here the word removed itself from the linked list. But it can't
  421. // touch the AVL tree here.
  422. for i := 0 to High(FDocuments) do
  423. FreeAndNil(FDocuments[i]);
  424. inherited Destroy;
  425. end;
  426. function TIndexedWord.GetLogicalDocument ( AIndex: Integer ) : TIndexDocument;
  427. begin
  428. Result := FDocuments[AIndex];;
  429. end;
  430. { TIndexDocument }
  431. procedure TIndexDocument.AddWordIndex ( AIndex: Integer ) ;
  432. begin
  433. if FLastEntry>=Length(WordIndex) Then
  434. SetLength(WordIndex, Length(WordIndex)+GrowSpeed);
  435. WordIndex[FLastEntry] := AIndex;
  436. Inc(FLastEntry);
  437. end;
  438. constructor TIndexDocument.Create ( ADocumentIndex: Integer ) ;
  439. begin
  440. FDocumentIndex := ADocumentIndex;
  441. flastentry:=0;
  442. end;
  443. function TIndexDocument.GetWordIndex(i:integer):integer;
  444. begin
  445. result:=WordIndex[i];
  446. end;
  447. function TIndexDocument.getindexentries:integer;
  448. begin
  449. result:=flastentry;
  450. end;
  451. end.