| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507 |
- { Copyright (C) <2008> <Andrew Haines> htmlindexer.pas
- This library is free software; you can redistribute it and/or modify it
- under the terms of the GNU Library General Public License as published by
- the Free Software Foundation; either version 2 of the License, or (at your
- option) any later version.
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License
- for more details.
- You should have received a copy of the GNU Library General Public License
- along with this library; if not, write to the Free Software Foundation,
- Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- }
- {
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- }
- unit HTMLIndexer;
- {$MODE OBJFPC}{$H+}
- interface
- uses Classes, SysUtils, FastHTMLParser,{$ifdef userb}fos_redblacktree_gen{$else}avl_tree{$endif};
- Type
- { TIndexDocument }
- TIndexDocument = class(TObject)
- private
- FDocumentIndex: Integer;
- FLastEntry : Integer;
- WordIndex: array of Integer;
- function getindexentries:integer;
- public
- function GetWordIndex(i:integer):integer; inline;
- procedure AddWordIndex(AIndex: Integer);
- constructor Create(ADocumentIndex: Integer);
- property DocumentIndex: Integer read FDocumentIndex;
- property IndexEntry[i:integer] : Integer read GetWordIndex;
- property NumberofIndexEntries : integer read getindexentries;
- end;
- { TIndexedWord }
- TIndexedWord = class(TObject)
- private
- FIsTitle: Boolean;
- FTheWord: string;
- FCachedTopic: TIndexDocument;
- FDocuments: Array of TIndexDocument;
- function GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
- function GetDocumentCount: Integer;
- public
- constructor Create(AWord: String; AIsTitle: Boolean);
- destructor Destroy; override;
- function GetLogicalDocument(AIndex: Integer): TIndexDocument;
- property TheWord: string read FTheWord write ftheword; // Always lowercase
- property DocumentTopic[TopicIndexNum: Integer]: TIndexDocument read GetDocument;
- property DocumentCount: Integer read GetDocumentCount;
- property IsTitle: Boolean read FIsTitle write fistitle;
- end;
- { TIndexedWordList }
- {$ifdef userb}
- TRBIndexTree = specialize TGFOS_RBTree<String,TIndexedWord>;
- {$endif}
- TForEachMethod = procedure (AWord:TIndexedWord) of object;
- TForEachProcedure = Procedure (AWord:TIndexedWord;state:pointer);
- TIndexedWordList = class(TObject)
- private
- FIndexTitlesOnly: Boolean;
- FIndexedFileCount: DWord;
- //vars while processing page
- FInTitle,
- FInBody: Boolean;
- FWordCount: Integer; // only words in body
- FDocTitle: String;
- FTopicIndex: Integer;
- //end vars
- FTotalDifferentWordLength: DWord;
- FTotalDIfferentWords: DWord;
- FTotalWordCount: DWord;
- FTotalWordLength: DWord;
- FLongestWord: DWord;
- FParser: THTMLParser;
- {$ifdef userb}
- FAVLTree : TRBIndexTree;
- {$else}
- FAVLTree : TAVLTree;
- Spare :TIndexedWord;
- {$endif}
- function AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
- // callbacks
- procedure CBFoundTag(NoCaseTag, ActualTag: string);
- procedure CBFountText(Text: string);
- procedure EatWords(Words: String; IsTitle: Boolean);
- public
- constructor Create;
- destructor Destroy; override;
- function IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String; // returns the documents <Title>
- procedure Clear;
- procedure AddWord(const AWord: TIndexedWord);
- procedure ForEach(Proc:TForEachMethod);
- procedure ForEach(Proc:TForEachProcedure;state:pointer);
- property IndexedFileCount: DWord read FIndexedFileCount;
- property LongestWord: DWord read FLongestWord;
- property TotalWordCount: DWord read FTotalWordCount;
- property TotalDIfferentWords: DWord read FTotalDIfferentWords;
- property TotalWordLength: DWord read FTotalWordLength;
- property TotalDifferentWordLength: DWord read FTotalDifferentWordLength;
- property Words[AWord: String; IsTitle: Boolean] : TIndexedWord read AddGetWord;
- end;
- implementation
- Const GrowSpeed = 10;
- function Max(ANumber, BNumber: DWord): DWord;
- begin
- if ANumber > BNumber then
- Result := ANumber
- else
- Result := BNumber;
- end;
- const titlexlat : array [boolean] of char = ('0','1');
- function makekey( n : string;istitle:boolean):string; inline;
- begin
- result:=n+'___'+titlexlat[istitle];
- end;
- Function CompareProcObj(Node1, Node2: Pointer): integer;
- var n1,n2 : TIndexedWord;
- begin
- n1:=TIndexedWord(Node1); n2:=TIndexedWord(Node2);
- Result := CompareText(n1.theword, n2.theword);
- if Result = 0 then
- begin
- Result := ord(n2.IsTitle)-ord(n1.IsTitle);
- end;
- if Result < 0 then Result := -1
- else if Result > 0 then Result := 1;
- end;
- { TIndexedWordList }
- function TIndexedWordList.AddGetWord(AWord: String; IsTitle: Boolean): TIndexedWord;
- var
- {$ifdef userb}
- key : string;
- {$else}
- n : TAVLTreeNode;
- {$endif}
- begin
- Result := nil;
- AWord := LowerCase(AWord);
- {$ifdef userb}
- key:=makekey(aword,istitle);
- if not favltree.Find(key,result) then result:=nil;;
- {$else}
- if not assigned(spare) then
- spare:=TIndexedWord.Create(AWord,IsTitle)
- else
- begin
- spare.TheWord:=aword;
- spare.IsTitle:=IsTitle;
- end;
- n:=favltree.FindKey(Spare,@CompareProcObj);
- if assigned(n) then
- result:=TIndexedWord(n.Data);
- {$endif}
-
- if Result = nil then
- begin
- Inc(FTotalDifferentWordLength, Length(AWord));
- Inc(FTotalDIfferentWords);
- {$ifdef userb}
- result:=TIndexedWord.Create(AWord,IsTitle);
- favltree.add(key,result);
- {$else}
- Result := spare; // TIndexedWord.Create(AWord,IsTitle);
- spare:=nil;
- AddWord(Result);
- {$endif}
- // if IsTitle then
- //WriteLn('Creating word: ', AWord);
- FLongestWord := Max(FLongestWord, Length(AWord));
- end;
- Inc(FTotalWordLength, Length(AWord));
- Inc(FTotalWordCount);
- end;
- procedure TIndexedWordList.CBFoundTag(NoCaseTag, ActualTag: string);
- begin
- if FInBody then begin
- if NoCaseTag = '</BODY>' then FInBody := False;
- end
- else begin
- //WriteLn('"',NoCaseTag,'"');
- if NoCaseTag = '<TITLE>' then FInTitle := True
- else if NoCaseTag = '</TITLE>' then FInTitle := False
- else if NoCaseTag = '<BODY>' then FInBody := True
- else
- end;
- if FInBody and FIndexTitlesOnly then FParser.Done := True;
- end;
- procedure TIndexedWordList.CBFountText(Text: string);
- begin
- if Length(Text) < 1 then
- Exit;
- if (not FInTitle) and (not FInBody) then
- Exit;
- EatWords(Text, FInTitle and not FInBody);
- end;
- procedure TIndexedWordList.EatWords ( Words: String; IsTitle: Boolean ) ;
- var
- WordPtr: PChar;
- WordStart: PChar;
- InWord: Boolean;
- IsNumberWord: Boolean;
- function IsEndOfWord: Boolean;
- begin
- Result := not (WordPtr^ in ['a'..'z', '0'..'9', #01, #$DE, #$FE]);
- if Result and IsNumberWord then
- Result := Result and (WordPtr[0] <> '.');
- if Result and InWord then
- Result := Result and (WordPtr[0] <> '''');
- ;
- end;
- var
- WordIndex: TIndexedWord;
- WordName: String;
- FPos: Integer;
- begin
- if IsTitle then
- FDocTitle := Words;
- Words := LowerCase(Words);
- WordStart := PChar(Words);
- WordPtr := WordStart;
- IsNumberWord := False;
- InWord := False;
- repeat
- if InWord and IsEndOfWord then
- begin
- WordName := Copy(WordStart, 0, (WordPtr-WordStart));
- FPos := Pos('''', WordName);
- while FPos > 0 do
- begin
- Delete(WordName, FPos, 1);
- FPos := Pos('''', WordName);
- end;
- WordIndex := addgetword(wordname,istitle);
- InWord := False;
- IsNumberWord := False;
- WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
- //if not IsTitle then
- Inc(FWordCount);
- end
- else if not InWord and not IsEndOfWord then
- begin
- InWord := True;
- WordStart := WordPtr;
- IsNumberWord := WordPtr^ in ['0'..'9'];
- end;
- Inc(WordPtr);
- until WordPtr^ = #0;
- if InWord then
- begin
- WordName := Copy(WordStart, 0, (WordPtr-WordStart));
- try
- WordIndex := addgetword(wordname,istitle); // Self.Words[WordName, IsTitle];
- except on e:exception do writeln('Error: ', wordname); end;
- WordIndex.DocumentTopic[FTopicIndex].AddWordIndex(FWordCount);
- InWord := False;
- //if IsNumberWord then WriteLn('Following is NUMBER WORD: "', (WordStart[0]),'"'); ;
- IsNumberWord := False;
- //WriteLn(FWordCount, ' "', WordName,'"');
- if not IsTitle then
- Inc(FWordCount);
- end;
- end;
- function defaultindexedword : TIndexedWord;
- begin
- result:=Tindexedword.create('',false);
- end;
- constructor TIndexedWordList.Create;
- begin
- inherited;
- {$ifdef userb}
- FAVLTree :=TRBIndexTree.create(@default_rb_string_compare,
- @defaultindexedword,
- @default_rb_string_undef );
- {$else}
- favltree:=TAVLTree.Create(@CompareProcObj);
- spare:=nil;
- {$endif}
- end;
- procedure FreeObject(const Obj:TIndexedWord);
- begin
- obj.free;
- end;
-
- destructor TIndexedWordList.Destroy;
- begin
- clear;
- {$ifndef userb}
- if assigned(spare) then spare.free;
- {$endif}
- favltree.free;
- inherited Destroy;
- end;
- function TIndexedWordList.IndexFile(AStream: TStream; ATOPICIndex: Integer; AIndexOnlyTitles: Boolean): String;
- var
- TheFile: String;
- begin
- FInBody := False;
- FInTitle:= False;
- FIndexTitlesOnly := AIndexOnlyTitles;
- FWordCount := 0;
- FTopicIndex := ATOPICIndex;
- FIndexedFileCount := FIndexedFileCount +1;
- SetLength(TheFile, AStream.Size+1);
- AStream.Position := 0;
- AStream.Read(TheFile[1], AStream.Size);
- TheFile[Length(TheFile)] := #0;
- FParser := THTMLParser.Create(@TheFile[1]);
- FParser.OnFoundTag := @CBFoundTag;
- FParser.OnFoundText := @CBFountText;
- FParser.Exec;
- FParser.Free;
- Result := FDocTitle;
- FDocTitle := '';
- FInBody := False;
- FInTitle:= False;
- FWordCount := 0;
- FTopicIndex := -1;
- AStream.Position := 0;
- end;
- procedure TIndexedWordList.Clear;
- begin
- {$ifdef userb}
- fAvlTree.ClearN(@FreeObject);
- {$else}
- fAvlTree.FreeAndClear;
- {$endif}
- end;
- procedure TIndexedWordList.AddWord(const AWord: TIndexedWord);
- begin
- {$ifdef userb}
- favltree.add(makekey(aword.theword,aword.istitle),AWord);
- {$else}
- favltree.add(aword);
- {$endif}
- end;
- procedure TIndexedWordList.ForEach(Proc:TForEachMethod);
- {$ifdef userb}
- var key : string;
- val:TIndexedWord;
- {$else}
- var
- AVLNode : TAVLTreeNode;
- {$endif}
- begin
- {$ifdef userb}
- if favltree.FirstNode(key,val) then
- begin // Scan it forward
- repeat
- proc(val);
- until not favltree.FindNext(key,val);
- end;
- {$else}
- AVLNode:=fAVLTree.FindLowest;
- while (AVLNode<>nil) do
- begin
- Proc(TIndexedWord(AVLNode.Data));
- AVLNode:=FAVLTree.FindSuccessor(AVLNode)
- end;
- {$endif}
- end;
- procedure TIndexedWordList.ForEach(Proc:TForEachProcedure;state:pointer);
- {$ifdef userb}
- var key : string;
- val:TIndexedWord;
- {$else}
- var
- AVLNode : TAVLTreeNode;
- {$endif}
- begin
- {$ifdef userb}
- if favltree.FirstNode(key,val) then
- begin // Scan it forward
- repeat
- proc(val,state);
- until not favltree.FindNext(key,val);
- end;
- {$else}
- AVLNode:=fAVLTree.FindLowest;
- while (AVLNode<>nil) do
- begin
- Proc(TIndexedWord(AVLNode.Data),State);
- AVLNode:=FAVLTree.FindSuccessor(AVLNode)
- end;
- {$endif}
- end;
- { TIndexedWord }
- function TIndexedWord.GetDocument ( TopicIndexNum: Integer ) : TIndexDocument;
- var
- i: Integer;
- begin
- Result := nil;
- if (FCachedTopic <> nil) and (FCachedTopic.FDocumentIndex = TopicIndexNum) then
- Exit(FCachedTopic);
- for i := 0 to High(FDocuments) do
- if FDocuments[i].FDocumentIndex = TopicIndexNum then
- Exit(FDocuments[i]);
- if Result = nil then
- begin
- Result := TIndexDocument.Create(TopicIndexNum);
- SetLength(FDocuments, Length(FDocuments)+1);
- FDocuments[High(FDocuments)] := Result;
- end;
- FCachedTopic := Result;
- end;
- function TIndexedWord.GetDocumentCount: Integer;
- begin
- Result := Length(FDocuments);
- end;
- constructor TIndexedWord.Create(AWord: String; AIsTitle: Boolean);
- begin
- FTheWord := AWord;
- FIsTitle := AIsTitle;
- end;
- destructor TIndexedWord.Destroy;
- var
- i: Integer;
- begin
- // here the word removed itself from the linked list. But it can't
- // touch the AVL tree here.
- for i := 0 to High(FDocuments) do
- FreeAndNil(FDocuments[i]);
- inherited Destroy;
- end;
- function TIndexedWord.GetLogicalDocument ( AIndex: Integer ) : TIndexDocument;
- begin
- Result := FDocuments[AIndex];;
- end;
- { TIndexDocument }
- procedure TIndexDocument.AddWordIndex ( AIndex: Integer ) ;
- begin
- if FLastEntry>=Length(WordIndex) Then
- SetLength(WordIndex, Length(WordIndex)+GrowSpeed);
- WordIndex[FLastEntry] := AIndex;
- Inc(FLastEntry);
- end;
- constructor TIndexDocument.Create ( ADocumentIndex: Integer ) ;
- begin
- FDocumentIndex := ADocumentIndex;
- flastentry:=0;
- end;
- function TIndexDocument.GetWordIndex(i:integer):integer;
- begin
- result:=WordIndex[i];
- end;
- function TIndexDocument.getindexentries:integer;
- begin
- result:=flastentry;
- end;
- end.
|