htmlindexer.pas 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. { Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
  2. This library is free software; you can redistribute it and/or modify it
  3. under the terms of the GNU Library General Public License as published by
  4. the Free Software Foundation; either version 2 of the License, or (at your
  5. option) any later version.
  6. This program is distributed in the hope that it will be useful, but WITHOUT
  7. ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  8. FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
  9. for more details.
  10. You should have received a copy of the GNU Library General Public License
  11. along with this library; if not, write to the Free Software Foundation,
  12. Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  13. }
  14. {
  15. See the file COPYING.FPC, included in this distribution,
  16. for details about the copyright.
  17. }
  18. unit HTMLIndexer;
  19. {$MODE OBJFPC}{$H+}
  20. interface
  21. uses Classes, SysUtils, FastHTMLParser;
  22. Type
  23. { TIndexedWord }
  24. { TIndexDocument }
  { Occurrence record of one word inside one document (topic).
    WordIndex collects the values handed to AddWordIndex, one entry per
    call, in call order. }
  TIndexDocument = class(TObject)
  private
    // Topic/document number this record belongs to; set once in Create.
    FDocumentIndex: Integer;
  public
    // Values recorded through AddWordIndex; grows by one element per call.
    WordIndex: array of Integer;
    constructor Create(ADocumentIndex: Integer);
    // Appends AIndex to the end of WordIndex.
    procedure AddWordIndex(AIndex: Integer);
    property DocumentIndex: Integer read FDocumentIndex;
  end;
  { One distinct word of the index. Instances are chained into a doubly
    linked list through PrevWord/NextWord and own one TIndexDocument per
    topic that was requested through DocumentTopic. }
  TIndexedWord = class(TObject)
  private
    FIsTitle: Boolean;
    FNextWord: TIndexedWord;
    FPrevWord: TIndexedWord;
    FTheWord: string;
    // Last document handed out by GetDocument; checked first on lookup.
    FCachedTopic: TIndexDocument;
    // Owned documents; freed in Destroy.
    FDocuments: array of TIndexDocument;
    function GetDocument(TopicIndexNum: Integer): TIndexDocument;
    function GetDocumentCount: Integer;
  public
    constructor Create(AWord: String; AIsTitle: Boolean);
    destructor Destroy; override;
    // Positional access into the owned documents (0 .. DocumentCount-1).
    function GetLogicalDocument(AIndex: Integer): TIndexDocument;
    property TheWord: string read FTheWord; // Always lowercase
    property PrevWord: TIndexedWord read FPrevWord write FPrevWord;
    property NextWord: TIndexedWord read FNextWord write FNextWord;
    // Lookup by topic number; the getter creates the document when missing.
    property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
    property DocumentCount: Integer read GetDocumentCount;
    property IsTitle: Boolean read FIsTitle;
  end;
  55. { TIndexedWordList }
  { Sorted, doubly linked list of TIndexedWord nodes plus the HTML-parsing
    state needed to fill it from documents, and running statistics about
    the words seen so far. }
  TIndexedWordList = class(TObject)
  private
    FIndexTitlesOnly: Boolean;
    FIndexedFileCount: DWord;
    // --- state that is only meaningful while a page is being processed ---
    FInTitle: Boolean;
    FInBody: Boolean;
    FWordCount: Integer; // only words in body
    FDocTitle: String;
    FTopicIndex: Integer;
    // --- end of per-page state ---
    FTotalDifferentWordLength: DWord;
    FTotalDIfferentWords: DWord;
    FTotalWordCount: DWord;
    FTotalWordLength: DWord;
    FLongestWord: DWord;
    FFirstWord: TIndexedWord;
    FCachedWord: TIndexedWord;
    FParser: THTMLParser;
    // Getter behind Words[]: returns the node for AWord, creating it if needed.
    function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
    function GetWordForward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
    function GetWordBackward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
    // Three-way comparison (-1, 0, 1) of AWord/AIsTitle against a node.
    function CompareWord(AWord: String; AIndexWord: TIndexedWord; AIsTitle: Boolean): Integer;
    // parser callbacks
    procedure CBFoundTag(NoCaseTag, ActualTag: string);
    procedure CBFountText(Text: string);
    // Splits a text run into words and records each one.
    procedure EatWords(Words: String; IsTitle: Boolean);
  public
    constructor Create;
    destructor Destroy; override;
    // Parses the stream as HTML and indexes it; returns the documents <Title>
    function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
    // Frees every word node in the list.
    procedure Clear;
    procedure AddWord(const AWord: TIndexedWord; StartingWord: TIndexedWord; AIsTitle: Boolean);
    property FirstWord: TIndexedWord read FFirstWord;
    property IndexedFileCount: DWord read FIndexedFileCount;
    property LongestWord: DWord read FLongestWord;
    property TotalWordCount: DWord read FTotalWordCount;
    property TotalDIfferentWords: DWord read FTotalDIfferentWords;
    property TotalWordLength: DWord read FTotalWordLength;
    property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
    // Reading this property adds the word when it is not indexed yet.
    property Words[AWord: String; IsTitle: Boolean]: TIndexedWord read AddGetWord;
  end;
  98. implementation
  { Returns the larger of two unsigned values (BNumber when they are equal). }
  function Max(ANumber, BNumber: DWord): DWord;
  begin
    Result := BNumber;
    if ANumber > BNumber then
      Result := ANumber;
  end;
  106. { TIndexedWordList }
  { Getter of the Words[] property.
    Lower-cases AWord, searches the list from its head and returns the
    matching node. When no node matches, a new TIndexedWord is created,
    linked into the list via AddWord and the "different word" statistics
    (count, total length, longest word) are updated. The overall word
    count and total word length are bumped on every call, hit or miss. }
  function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
  var
    Neighbour: TIndexedWord; // node next to which a new word has to be linked
    WordLen: DWord;
  begin
    AWord := LowerCase(AWord);
    WordLen := Length(AWord);

    Result := GetWordForward(AWord, FFirstWord, Neighbour, IsTitle);
    if not Assigned(Result) then
    begin
      // First sighting of this word: account for it, then create and link it.
      Inc(FTotalDifferentWordLength, WordLen);
      Inc(FTotalDIfferentWords);
      Result := TIndexedWord.Create(AWord, IsTitle);
      AddWord(Result, Neighbour, IsTitle);
      if WordLen > FLongestWord then
        FLongestWord := WordLen;
    end;

    Inc(FTotalWordLength, WordLen);
    Inc(FTotalWordCount);
  end;
  { Searches for AWord/AIsTitle walking from StartWord towards the end of
    the list (NextWord direction).

    Returns the matching node, or nil when the walk reaches the end of the
    list or a node that sorts after AWord.

    WrongWord receives the last non-matching node that was examined: the
    first node sorting after AWord when one was hit, otherwise the last
    node visited. It is nil when StartWord is nil or StartWord itself is
    the match.

    Each node is compared exactly once. }
  function TIndexedWordList.GetWordForward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
  var
    Current: TIndexedWord;
  begin
    Result := nil;
    WrongWord := nil;
    Current := StartWord;
    while Current <> nil do
      case CompareWord(AWord, Current, AIsTitle) of
        0:
          Exit(Current);
        1:
          begin
            // AWord sorts after this node: keep walking.
            WrongWord := Current;
            Current := Current.NextWord;
          end;
      else
        begin
          // AWord sorts before this node, so it cannot appear further on.
          WrongWord := Current;
          Exit;
        end;
      end;
  end;
  { Searches for AWord/AIsTitle walking from StartWord towards the head of
    the list (PrevWord direction).

    Returns the matching node, or nil when the walk runs off the head of
    the list or reaches a node that sorts before AWord.

    WrongWord receives the node next to which AWord belongs: the last
    examined node that still sorts after AWord (AWord goes directly before
    it), or StartWord itself when StartWord already sorts before AWord.
    It is nil when StartWord is nil or StartWord itself is the match.

    The previous implementation had the direction inverted: it stopped as
    soon as AWord sorted before the current node and stepped to PrevWord
    when AWord sorted after it, so it never walked backwards past a larger
    node. For a start node whose predecessor already sorts before AWord
    (the way AddWord calls it) the outcome is unchanged. }
  function TIndexedWordList.GetWordBackward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
  var
    Current: TIndexedWord;
  begin
    Result := nil;
    WrongWord := nil;
    Current := StartWord;
    while Current <> nil do
      case CompareWord(AWord, Current, AIsTitle) of
        0:
          Exit(Current);
        -1:
          begin
            // AWord sorts before this node: remember it and step back.
            WrongWord := Current;
            Current := Current.PrevWord;
          end;
      else
        begin
          // AWord sorts after this node: the insertion point is found.
          // Keep the successor recorded above; only fall back to the
          // current node when no larger node was seen at all.
          if WrongWord = nil then
            WrongWord := Current;
          Exit;
        end;
      end;
  end;
  { Three-way comparison of a search key (AWord, AIsTitle) with a node.
    The text is compared case-insensitively first. For equal text the
    title flags decide: a title node sorts before a non-title node with
    the same text. The result is always exactly -1, 0 or 1. }
  function TIndexedWordList.CompareWord(AWord: String;
    AIndexWord: TIndexedWord; AIsTitle: Boolean): Integer;
  var
    Diff: Integer;
  begin
    Diff := CompareText(AWord, AIndexWord.TheWord);
    if Diff = 0 then
      Diff := Ord(AIndexWord.IsTitle) - Ord(AIsTitle);

    // Collapse whatever magnitude CompareText produced to a plain sign.
    if Diff > 0 then
      Result := 1
    else if Diff < 0 then
      Result := -1
    else
      Result := 0;
  end;
  { Parser callback, invoked for every tag found.
    Tracks whether the parser is currently inside <TITLE> or <BODY> and,
    when only titles are being indexed, stops the parser as soon as the
    body starts.

    NoCaseTag is compared against upper-case tag text, so it is presumably
    the upper-cased tag including the angle brackets (NOTE(review): confirm
    against FastHTMLParser). Opening tags are recognised with or without
    attributes, e.g. both "<BODY>" and "<BODY BGCOLOR=...>"; the previous
    exact comparison missed the latter, which left FInBody False for the
    whole document.

    ActualTag is not used. }
  procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);

    // True when NoCaseTag opens the element TagName (upper case), either
    // as a bare tag or followed by whitespace and attributes.
    function IsOpeningTag(const TagName: string): Boolean;
    var
      Prefix: string;
    begin
      Prefix := '<' + TagName;
      Result := (NoCaseTag = Prefix + '>')
        or ((Length(NoCaseTag) > Length(Prefix))
            and (Copy(NoCaseTag, 1, Length(Prefix)) = Prefix)
            and (NoCaseTag[Length(Prefix) + 1] in [' ', #9, #10, #13]));
    end;

  begin
    if FInBody then
    begin
      // Inside the body only its closing tag is of interest.
      if NoCaseTag = '</BODY>' then
        FInBody := False;
    end
    else if IsOpeningTag('TITLE') then
      FInTitle := True
    else if NoCaseTag = '</TITLE>' then
      FInTitle := False
    else if IsOpeningTag('BODY') then
      FInBody := True;

    // Titles-only mode: nothing past the start of the body is needed.
    if FInBody and FIndexTitlesOnly then
      FParser.Done := True;
  end;
  { Parser callback, invoked for every run of text between tags.
    Non-empty text is passed on to EatWords; it counts as title text only
    while a title is open and the body has not started. }
  procedure TIndexedWordList.CBFountText(Text: string);
  begin
    if Text <> '' then
      EatWords(Text, FInTitle and not FInBody);
  end;
  { Splits a run of text into words and records every word for the topic
    currently being indexed (FTopicIndex) at position FWordCount.

    Words     - the raw text; when IsTitle is set it also becomes the
                document title (FDocTitle) before being lower-cased.
    IsTitle   - whether the words are recorded as title words.

    A word starts at a character from the word set (a..z, 0..9 and three
    extra byte values) and runs until the first character outside it.
    Inside a word an apostrophe does not end the word, and inside a word
    that started with a digit a '.' does not end it either. Apostrophes
    are removed from the word before it is stored.

    Changes over the previous implementation:
      * the last word of the text is stripped of apostrophes like every
        other word (it used to be stored verbatim, so the same word could
        end up under two different keys);
      * empty input is handled without reading past the terminator;
      * each word is copied straight from the buffer instead of converting
        the whole remaining buffer to a string first. }
  procedure TIndexedWordList.EatWords(Words: String; IsTitle: Boolean);
  var
    WordPtr: PChar;
    WordStart: PChar;
    InWord: Boolean;
    IsNumberWord: Boolean;

    // True when the character under WordPtr cannot be part of the word
    // that is currently open (or cannot start a word when none is open).
    function IsEndOfWord: Boolean;
    begin
      Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
      if Result and IsNumberWord then
        Result := WordPtr^ <> '.';
      if Result and InWord then
        Result := WordPtr^ <> '''';
    end;

    // Records the word spanning WordStart .. WordPtr-1 and closes it.
    procedure StoreWord;
    var
      WordName: String;
    begin
      SetString(WordName, WordStart, WordPtr - WordStart);
      WordName := StringReplace(WordName, '''', '', [rfReplaceAll]);
      Self.Words[WordName, IsTitle].DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
      InWord := False;
      IsNumberWord := False;
    end;

  begin
    if IsTitle then
      FDocTitle := Words;
    Words := LowerCase(Words);
    WordStart := PChar(Words);
    WordPtr := WordStart;
    IsNumberWord := False;
    InWord := False;

    while WordPtr^ <> #0 do
    begin
      if InWord then
      begin
        if IsEndOfWord then
        begin
          StoreWord;
          Inc(FWordCount);
        end;
      end
      else if not IsEndOfWord then
      begin
        InWord := True;
        WordStart := WordPtr;
        IsNumberWord := WordPtr^ in ['0'..'9'];
      end;
      Inc(WordPtr);
    end;

    // A word that runs up to the end of the text is still open here.
    if InWord then
    begin
      StoreWord;
      // NOTE(review): unlike the loop above, the position counter is not
      // advanced for a trailing title word. Kept exactly as it was;
      // confirm which of the two behaviours is intended.
      if not IsTitle then
        Inc(FWordCount);
    end;
  end;
  { Creates an empty word list; no setup beyond the inherited constructor. }
  constructor TIndexedWordList.Create;
  begin
    inherited Create;
  end;
  { Releases every word node via Clear before the list itself goes away. }
  destructor TIndexedWordList.Destroy;
  begin
    Self.Clear;
    inherited Destroy;
  end;
  { Reads the whole stream, parses it as HTML and indexes its words under
    topic number ATOPICIndex.

    AStream          - source document; read from position 0 and rewound to
                       position 0 afterwards.
    ATOPICIndex      - topic number the words are recorded under.
    AIndexOnlyTitles - when True the parser is stopped as soon as the body
                       starts (see CBFoundTag).

    Returns the text of the document title as captured during parsing
    (empty when none was seen).

    The parser is released and the per-document state is reset even when
    parsing raises, so a failed document cannot leak its title or flags
    into the next call. The buffer is terminated after the bytes that were
    actually read rather than after the expected size. }
  function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
  var
    TheFile: String;
    BytesRead: Integer;
  begin
    FInBody := False;
    FInTitle := False;
    FIndexTitlesOnly := AIndexOnlyTitles;
    FWordCount := 0;
    FTopicIndex := ATOPICIndex;
    FIndexedFileCount := FIndexedFileCount + 1;

    // One extra character so there is always room for the #0 the parser
    // stops at.
    SetLength(TheFile, AStream.Size + 1);
    AStream.Position := 0;
    BytesRead := AStream.Read(TheFile[1], AStream.Size);
    if BytesRead < 0 then
      BytesRead := 0;
    TheFile[BytesRead + 1] := #0;

    FParser := THTMLParser.Create(@TheFile[1]);
    try
      FParser.OnFoundTag := @CBFoundTag;
      FParser.OnFoundText := @CBFountText;
      FParser.Exec;
      Result := FDocTitle;
    finally
      FreeAndNil(FParser);
      FDocTitle := '';
      FInBody := False;
      FInTitle := False;
      FWordCount := 0;
      FTopicIndex := -1;
    end;

    AStream.Position := 0;
  end;
  { Frees every word node and leaves the list empty.
    FCachedWord is reset as well; it used to keep pointing at a freed
    node. The statistics counters are left as they are. }
  procedure TIndexedWordList.Clear;
  var
    Doomed: TIndexedWord;
  begin
    FCachedWord := nil;
    while FFirstWord <> nil do
    begin
      // Advance the head first; freeing a node unlinks it from its
      // neighbours.
      Doomed := FFirstWord;
      FFirstWord := Doomed.NextWord;
      Doomed.Free;
    end;
  end;
  { Links AWord into the sorted list.

    AWord        - node to insert; must not be linked anywhere yet.
    StartingWord - node at which the search for the insertion point
                   starts, ideally a neighbour of the final position.
                   nil means "start at the head of the list".
    AIsTitle     - title flag used for ordering (see CompareWord).

    When a node equal to AWord is already in the list nothing is linked
    and the list is left untouched; the caller keeps ownership of AWord.
    After a successful insert FFirstWord is updated if AWord became the
    head, and FCachedWord points at AWord.

    Fixes over the previous implementation:
      * a nil StartingWord on a non-empty list no longer dereferences nil;
      * a duplicate no longer overwrites FFirstWord with the unlinked
        node (which orphaned the whole list);
      * inserting after a node that has a successor now also repairs the
        successor's PrevWord link. }
  procedure TIndexedWordList.AddWord(const AWord: TIndexedWord; StartingWord: TIndexedWord; AIsTitle: Boolean);
  var
    Neighbour: TIndexedWord;
    Existing: TIndexedWord;
  begin
    if FFirstWord = nil then
    begin
      // Empty list: the new word is the whole list.
      FFirstWord := AWord;
      FCachedWord := AWord;
      Exit;
    end;

    if StartingWord = nil then
      StartingWord := FFirstWord;

    // Locate the node next to which AWord has to be linked.
    Neighbour := StartingWord;
    Existing := nil;
    case CompareWord(AWord.TheWord, StartingWord, AIsTitle) of
      1: Existing := GetWordForward(AWord.TheWord, StartingWord, Neighbour, AIsTitle);
      0: Existing := StartingWord;
      -1: Existing := GetWordBackward(AWord.TheWord, StartingWord, Neighbour, AIsTitle);
    end;
    if Existing <> nil then
      Exit; // already indexed; do not create a duplicate node
    if Neighbour = nil then
      Neighbour := FFirstWord;

    case CompareWord(AWord.TheWord, Neighbour, AIsTitle) of
      -1:
        begin
          // Insert directly before Neighbour.
          AWord.PrevWord := Neighbour.PrevWord;
          AWord.NextWord := Neighbour;
          if AWord.PrevWord <> nil then
            AWord.PrevWord.NextWord := AWord;
          Neighbour.PrevWord := AWord;
        end;
      1:
        begin
          // Insert directly after Neighbour.
          AWord.PrevWord := Neighbour;
          AWord.NextWord := Neighbour.NextWord;
          if AWord.NextWord <> nil then
            AWord.NextWord.PrevWord := AWord;
          Neighbour.NextWord := AWord;
        end;
    else
      Exit; // equal to Neighbour: nothing to link
    end;

    if AWord.PrevWord = nil then
      FFirstWord := AWord;
    FCachedWord := AWord;
  end;
  372. { TIndexedWord }
  { Getter of the DocumentTopic[] property.
    Returns the document record of this word for topic TopicIndexNum,
    creating and storing a new one when none exists yet. The most recently
    returned record is cached so repeated lookups for the same topic skip
    the linear scan; the cache is now refreshed on a scan hit too, not
    only when a record is created. }
  function TIndexedWord.GetDocument(TopicIndexNum: Integer): TIndexDocument;
  var
    i: Integer;
  begin
    if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
      Exit(FCachedTopic);

    Result := nil;
    for i := 0 to High(FDocuments) do
      if FDocuments[i].FDocumentIndex = TopicIndexNum then
      begin
        Result := FDocuments[i];
        Break;
      end;

    if Result = nil then
    begin
      // Unknown topic: append a fresh record owned by this word.
      Result := TIndexDocument.Create(TopicIndexNum);
      SetLength(FDocuments, Length(FDocuments) + 1);
      FDocuments[High(FDocuments)] := Result;
    end;

    FCachedTopic := Result;
  end;
  { Getter of DocumentCount: number of document records this word owns. }
  function TIndexedWord.GetDocumentCount: Integer;
  begin
    Result := High(FDocuments) + 1;
  end;
  { Creates an unlinked word node. AWord is stored as given (callers pass
    it lower-cased); AIsTitle marks the node as a title word. }
  constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
  begin
    inherited Create;
    FIsTitle := AIsTitle;
    FTheWord := AWord;
  end;
  { Unlinks the node from its neighbours, so the list stays connected
    around it, and frees every document record it owns. }
  destructor TIndexedWord.Destroy;
  var
    DocNo: Integer;
  begin
    // Bridge the gap this node leaves behind.
    if Assigned(FPrevWord) then
      FPrevWord.NextWord := FNextWord;
    if Assigned(FNextWord) then
      FNextWord.PrevWord := FPrevWord;

    for DocNo := Low(FDocuments) to High(FDocuments) do
      FreeAndNil(FDocuments[DocNo]);

    inherited Destroy;
  end;
  { Returns the document record stored at position AIndex
    (0 .. DocumentCount-1). The index is not validated here. }
  function TIndexedWord.GetLogicalDocument(AIndex: Integer): TIndexDocument;
  begin
    Result := FDocuments[AIndex];
  end;
  416. { TIndexDocument }
  { Appends AIndex to WordIndex, growing the array by one element. }
  procedure TIndexDocument.AddWordIndex(AIndex: Integer);
  var
    Slot: Integer;
  begin
    Slot := Length(WordIndex);
    SetLength(WordIndex, Slot + 1);
    WordIndex[Slot] := AIndex;
  end;
  { Creates an empty record for the document numbered ADocumentIndex. }
  constructor TIndexDocument.Create(ADocumentIndex: Integer);
  begin
    inherited Create;
    FDocumentIndex := ADocumentIndex;
  end;
  426. end.