123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479 |
- { Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
- This library is free software; you can redistribute it and/or modify it
- under the terms of the GNU Library General Public License as published by
- the Free Software Foundation; either version 2 of the License, or (at your
- option) any later version.
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
- for more details.
- You should have received a copy of the GNU Library General Public License
- along with this library; if not, write to the Free Software Foundation,
- Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- }
- {
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- }
- unit HTMLIndexer;
- {$MODE OBJFPC}{$H+}
- interface
- uses Classes, SysUtils, FastHTMLParser;
- Type
- { TIndexedWord }
- { TIndexDocument }
- TIndexDocument = class(TObject)
- private
- FDocumentIndex: Integer;
- public
- WordIndex: array of Integer;
- procedure AddWordIndex(AIndex: Integer);
- constructor Create(ADocumentIndex: Integer);
- property DocumentIndex: Integer read FDocumentIndex;
- end;
- TIndexedWord = class(TObject)
- private
- FIsTitle: Boolean;
- FNextWord: TIndexedWord;
- FPrevWord: TIndexedWord;
- FTheWord: string;
- FCachedTopic: TIndexDocument;
- FDocuments: Array of TIndexDocument;
- function GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
- function GetDocumentCount: Integer;
- public
- constructor Create(AWord: String; AIsTitle: Boolean);
- destructor Destroy; override;
- function GetLogicalDocument(AIndex: Integer): TIndexDocument;
- property TheWord: string read FTheWord; // Always lowercase
- property PrevWord: TIndexedWord read FPrevWord write FPrevWord;
- property NextWord: TIndexedWord read FNextWord write FNextWord;
- property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
- property DocumentCount: Integer read GetDocumentCount;
- property IsTitle: Boolean read FIsTitle;
- end;
- { TIndexedWordList }
- TIndexedWordList = class(TObject)
- private
- FIndexTitlesOnly: Boolean;
- FIndexedFileCount: DWord;
- //vars while processing page
- FInTitle,
- FInBody: Boolean;
- FWordCount: Integer; // only words in body
- FDocTitle: String;
- FTopicIndex: Integer;
- //end vars
- FTotalDifferentWordLength: DWord;
- FTotalDIfferentWords: DWord;
- FTotalWordCount: DWord;
- FTotalWordLength: DWord;
- FLongestWord: DWord;
- FFirstWord: TIndexedWord;
- FCachedWord: TIndexedWord;
- FParser: THTMLParser;
- function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
- function GetWordForward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
- function GetWordBackward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
- function CompareWord(AWord: String; AIndexWord: TIndexedWord; AIsTitle: Boolean): Integer;
- // callbacks
- procedure CBFoundTag(NoCaseTag, ActualTag: string);
- procedure CBFountText(Text: string);
- procedure EatWords(Words: String; IsTitle: Boolean);
- public
- constructor Create;
- destructor Destroy; override;
- function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
- procedure Clear;
- procedure AddWord(const AWord: TIndexedWord; StartingWord: TIndexedWord; AIsTitle: Boolean);
- property FirstWord: TIndexedWord read FFirstWord;
- property IndexedFileCount: DWord read FIndexedFileCount;
- property LongestWord: DWord read FLongestWord;
- property TotalWordCount: DWord read FTotalWordCount;
- property TotalDIfferentWords: DWord read FTotalDIfferentWords;
- property TotalWordLength: DWord read FTotalWordLength;
- property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
- property Words[AWord: String; IsTitle: Boolean] : TIndexedWord read AddGetWord;
- end;
- implementation
- function Max(ANumber, BNumber: DWord): DWord;
- begin
- if ANumber > BNumber then
- Result := ANumber
- else
- Result := BNumber;
- end;
- { TIndexedWordList }
- function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
- var
- //StartWord,
- WrongWord: TIndexedWord;
- begin
- Result := nil;
- AWord := LowerCase(AWord);
- {if FCachedWord <> nil then
- StartWord := FCachedWord
- else
- StartWord := FFirstWord;
- if StartWord <> nil then
- begin
- case CompareWord(AWord, StartWord, IsTitle) of
- 0: Exit(WrongWord);
- 1: Result := GetWordBackward(AWord, StartWord, WrongWord, IsTitle);
- -1: Result := GetWordForward(AWord, StartWord, WrongWord, IsTitle);
- end;
- end
- else}
- Result := GetWordForward(AWord, FFirstWord, WrongWord, IsTitle);
- if Result = nil then
- begin
- Inc(FTotalDifferentWordLength, Length(AWord));
- Inc(FTotalDIfferentWords);
- Result := TIndexedWord.Create(AWord,IsTitle);
- AddWord(Result, WrongWord,IsTitle);
- if IsTitle then
- ;//WriteLn('Creating word: ', AWord);
- FLongestWord := Max(FLongestWord, Length(AWord));
- end;
- Inc(FTotalWordLength, Length(AWord));
- Inc(FTotalWordCount);
- end;
- function TIndexedWordList.GetWordForward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
- var
- FCurrentWord: TIndexedWord;
- begin
- Result := nil;
- WrongWord := nil;
- FCurrentWord := StartWord;
- while (FCurrentWord <> nil) and (CompareWord(AWord, FCurrentWord, AIsTitle) <> 0) do
- begin
- WrongWord := FCurrentWord;
- case CompareWord(AWord, FCurrentWord, AIsTitle) of
- -1: FCurrentWord := nil;
- 0: Exit(FCurrentWord);
- 1: FCurrentWord := FCurrentWord.NextWord;
- end;
- end;
- if FCurrentWord <> nil then
- Result := FCurrentWord;
- end;
- function TIndexedWordList.GetWordBackward(AWord: String; StartWord: TIndexedWord; out WrongWord: TIndexedWord; AIsTitle: Boolean): TIndexedWord;
- var
- FCurrentWord: TIndexedWord;
- begin
- Result := nil;
- WrongWord := nil;
- FCurrentWord := StartWord;
- while (FCurrentWord <> nil) and (CompareWord(AWord, FCurrentWord, AIsTitle) <> 0) do
- begin
- WrongWord := FCurrentWord;
- case CompareWord(AWord, FCurrentWord, AIsTitle) of
- -1:
- begin
- WrongWord := FCurrentWord;
- FCurrentWord := nil
- end;
- 0: Exit(FCurrentWord);
- 1: FCurrentWord := FCurrentWord.PrevWord;
- end;
- end;
- if FCurrentWord <> nil then
- Result := FCurrentWord;
- end;
- function TIndexedWordList.CompareWord ( AWord: String;
- AIndexWord: TIndexedWord; AIsTitle: Boolean ) : Integer;
- begin
- Result := CompareText(AWord, AIndexWord.TheWord);
- if Result = 0 then
- begin
- Result := Result + ord(AIndexWord.IsTitle);
- Result := Result - ord(AIsTitle);
- end;
- if Result < 0 then Result := -1
- else if Result > 0 then Result := 1;
- //if AIsTitle then
- //WriteLn('Looking for title word :', AWord);
- //WriteLn(Result);
- end;
- procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);
- begin
- if FInBody then begin
- if NoCaseTag = '</BODY>' then FInBody := False;
- end
- else begin
- //WriteLn('"',NoCaseTag,'"');
- if NoCaseTag = '<TITLE>' then FInTitle := True
- else if NoCaseTag = '</TITLE>' then FInTitle := False
- else if NoCaseTag = '<BODY>' then FInBody := True
- else
- end;
- if FInBody and FIndexTitlesOnly then FParser.Done := True;
- end;
- procedure TIndexedWordList.CBFountText(Text: string);
- begin
- if Length(Text) < 1 then
- Exit;
- EatWords(Text, FInTitle and not FInBody);
- end;
- procedure TIndexedWordList.EatWords ( Words: String; IsTitle: Boolean ) ;
- var
- WordPtr: PChar;
- WordStart: PChar;
- InWord: Boolean;
- IsNumberWord: Boolean;
- function IsEndOfWord: Boolean;
- begin
- Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
- if Result and IsNumberWord then
- Result := Result and (WordPtr[0] <> '.');
- if Result and InWord then
- Result := Result and (WordPtr[0] <> '''');
- ;
- end;
- var
- WordIndex: TIndexedWord;
- WordName: String;
- FPos: Integer;
- begin
- if IsTitle then
- FDocTitle := Words;
- Words := LowerCase(Words);
- WordStart := PChar(Words);
- WordPtr := WordStart;
- IsNumberWord := False;
- InWord := False;
- repeat
- if InWord and IsEndOfWord then
- begin
- WordName := Copy(WordStart, 0, (WordPtr-WordStart));
- FPos := Pos('''', WordName);
- while FPos > 0 do
- begin
- Delete(WordName, FPos, 1);
- FPos := Pos('''', WordName);
- end;
- WordIndex := Self.Words[WordName, IsTitle];
- InWord := False;
- //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
- IsNumberWord := False;
- WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
- //WriteLn(FWordCount, ' "', WordName,'"');
- //if not IsTitle then
- Inc(FWordCount);
- end
- else if not InWord and not IsEndOfWord then
- begin
- InWord := True;
- WordStart := WordPtr;
- IsNumberWord := WordPtr^ in ['0'..'9'];
- //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', WordPtr[0],'"'); ;
- end;
- Inc(WordPtr);
- until WordPtr^ = #0;
- if InWord then
- begin
- WordName := Copy(WordStart, 0, (WordPtr-WordStart));
- WordIndex := Self.Words[WordName, IsTitle];
- WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
- InWord := False;
- //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
- IsNumberWord := False;
- //WriteLn(FWordCount, ' "', WordName,'"');
- if not IsTitle then
- Inc(FWordCount);
- end;
- end;
- constructor TIndexedWordList.Create;
- begin
- inherited;
- end;
- destructor TIndexedWordList.Destroy;
- begin
- Clear;
- inherited Destroy;
- end;
- function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
- var
- TheFile: String;
- begin
- FInBody := False;
- FInTitle:= False;
- FIndexTitlesOnly := AIndexOnlyTitles;
- FWordCount := 0;
- FTopicIndex := ATOPICIndex;
- FIndexedFileCount := FIndexedFileCount +1;
- SetLength(TheFile, AStream.Size+1);
- AStream.Position := 0;
- AStream.Read(TheFile[1], AStream.Size);
- TheFile[Length(TheFile)] := #0;
- FParser := THTMLParser.Create(@TheFile[1]);
- FParser.OnFoundTag := @CBFoundTag;
- FParser.OnFoundText := @CBFountText;
- FParser.Exec;
- FParser.Free;
- Result := FDocTitle;
- FDocTitle := '';
- FInBody := False;
- FInTitle:= False;
- FWordCount := 0;
- FTopicIndex := -1;
- AStream.Position := 0;
- end;
- procedure TIndexedWordList.Clear;
- var
- FCurrentWord: TIndexedWord;
- begin
- FCurrentWord := FFirstWord;
- while FCurrentWord <> nil do
- begin
- FFirstWord := FCurrentWord.NextWord;
- FCurrentWord.Free;
- FCurrentWord := FFirstWord;
- end;
- end;
- procedure TIndexedWordList.AddWord(const AWord: TIndexedWord; StartingWord: TIndexedWord; AIsTitle: Boolean);
- var
- WrongWord: TIndexedWord;
- begin
- if FFirstWord = nil then
- FFirstWord := AWord
- else begin
- if StartingWord <> nil then
- WrongWord := StartingWord;
- case CompareWord(AWord.TheWord, StartingWord, AIsTitle) of
- 1: GetWordForward(AWord.TheWord, StartingWord, WrongWord, AIsTitle);
- 0: ; // uh oh
- -1: GetWordBackward(AWord.TheWord, StartingWord, WrongWord, AIsTitle);
- end;
- if WrongWord = nil then
- WrongWord := FirstWord;
- case CompareWord(AWord.TheWord, WrongWord, AIsTitle) of
- -1:
- begin
- AWord.PrevWord := WrongWord.PrevWord;
- if AWord.PrevWord <> nil then
- AWord.PrevWord.NextWord := AWord;
- WrongWord.PrevWord := AWord;
- AWord.NextWord := WrongWord;
- end;
- 0: ;//WriteLn('Found word which shouldn''t happen'); // uh oh
- 1:
- begin
- AWord.PrevWord := WrongWord;
- AWord.NextWord := WrongWord.NextWord;
- WrongWord.NextWord := AWord;
- end;
- end;
- end;
- if AWord.PrevWord = nil then
- FFirstWord := AWord;
- FCachedWord := AWord;
- end;
- { TIndexedWord }
- function TIndexedWord.GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
- var
- i: Integer;
- begin
- Result := nil;
- if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
- Exit(FCachedTopic);
- for i := 0 to High(FDocuments) do
- if FDocuments[i].FDocumentIndex = TopicIndexNum then
- Exit(FDocuments[i]);
- if Result = nil then
- begin
- Result := TIndexDocument.Create(TopicIndexNum);
- SetLength(FDocuments, Length(FDocuments)+1);
- FDocuments[High(FDocuments)] := Result;
- end;
- FCachedTopic := Result;
- end;
- function TIndexedWord.GetDocumentCount: Integer;
- begin
- Result := Length(FDocuments);
- end;
- constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
- begin
- FTheWord := AWord;
- FIsTitle := AIsTitle;
- end;
- destructor TIndexedWord.Destroy;
- var
- i: Integer;
- begin
- if FPrevWord <> nil then
- FPrevWord.NextWord := FNextWord;
- if FNextWord <> nil then
- FNextWord.PrevWord := FPrevWord;
- for i := 0 to High(FDocuments) do
- FreeAndNil(FDocuments[i]);
- inherited Destroy;
- end;
- function TIndexedWord.GetLogicalDocument ( AIndex: Integer ) : TIndexDocument;
- begin
- Result := FDocuments[AIndex];;
- end;
- { TIndexDocument }
- procedure TIndexDocument.AddWordIndex ( AIndex: Integer ) ;
- begin
- SetLength(WordIndex, Length(WordIndex)+1);
- WordIndex[High(WordIndex)] := AIndex;
- end;
- constructor TIndexDocument.Create ( ADocumentIndex: Integer ) ;
- begin
- FDocumentIndex := ADocumentIndex;
- end;
- end.
|