pascal
/
freepascal.compiler
mirror of https://gitlab.com/freepascal.org/fpc/source.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
							{
    This file is part of the Free Component Library (FCL)
    Copyright (c) 2012 by the Free Pascal development team

    HTML text reader
    
    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}
unit IReaderHTML;

{$mode objfpc}{$H+}

interface

uses
  FastHTMLParser, //, HTMLUtil,          // Fast Parser Functions
  Classes, fpIndexer;

type

  { TIReaderHTML }

  TIReaderHTML = class(TCustomFileReader)
  private
    sLine: UTF8String;
    StartPos: integer;
    Offset: integer;
    LinePos: integer;
    Tg, Tx: integer;
    FParser: THTMLParser; //our htmlparser class
    procedure OnTag(NoCaseTag, ActualTag: String);
    procedure OnText(Text: String);
  protected
    function GetToken: UTF8String; override;
    function AllowedToken(token: UTF8String): boolean; override;
  public
    procedure LoadFromStream(FileStream: TStream); override;
  end;

implementation

{ TIReaderHTML }

procedure TIReaderHTML.OnTag(NoCaseTag, ActualTag: String);
begin
end;

procedure TIReaderHTML.OnText(Text: String);
var
  token: UTF8String;
  s: TSearchWordData;
  i : Integer;
begin
  sLine := Text;
  LinePos := 1;
  Offset:=FParser.CurrentPos;
  token := GetToken;
  while token <> '' do
    begin
    if AllowedToken(token) then
      begin
      s.SearchWord := token;
      s.Position := Offset+StartPos;
      // Copy area around text.
      I:=StartPos-(MaxContextLen div 2);
      If I<1 then
        I:=1;
      s.Context := Copy(SLine,I,I+MaxContextLen);
      Add(s);
      end;
    token := GetToken;
    end;
end;

function TIReaderHTML.GetToken: UTF8String;
var
  s: UTF8String;
  c: UTF8String;
begin
  Result := '';

  if (sLine = '') or (LinePos >= Length(sLine)) then
    exit;

  c := sLine[LinePos];
  Inc(LinePos);

  if LinePos <= Length(sLine) then
  begin

    //eat all invalid characters
    while not (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) and (LinePos <= Length(sLine)) do
    begin
      c := sLine[LinePos];
      Inc(LinePos);
    end;

    if not (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) then
      s := ''
    else
      s := c;
    StartPos:=LinePos;
    if LinePos <= Length(sLine) then
    begin
      //now read all valid characters from stream and append
      c := sLine[LinePos];
      Inc(LinePos);
      while (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) and (LinePos <= Length(sLine)) do
      begin
        s := S + c;
        c := sLine[LinePos];
        Inc(LinePos);
      end;
    end;

    if not (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) then
      Result := LowerCase(s)
    else
      Result := LowerCase(s + c);
  end;
end;

function TIReaderHTML.AllowedToken(token: UTF8String): boolean;
begin
  Result := (Length(token) > 1) and
            (token <> 'nbsp') and (token <> 'quot') and (token <> 'apos') and
            (token <> 'amp') and (token <> 'lt') and (token <> 'gt');
end;

procedure TIReaderHTML.LoadFromStream(FileStream: TStream);
var
  S : TStringStream;

begin
  inherited LoadFromStream(FileStream);
  S:=TStringStream.Create('');
  try
    S.CopyFrom(FileStream,0);
    Tg := 0;
    Tx := 0;
    FParser := THTMLParser.Create(S.DataString);
    try
      FParser.OnFoundTag := @OnTag;
      FParser.OnFoundText := @OnText;
      FParser.Exec;
    finally
      FParser.Free;
     end;
  finally
    S.Free;
  end;
  if DetectLanguage then
    DoDetectLanguage;
end;

initialization
  FileHandlers.RegisterFileReader('HTML format', 'html;htm', TIReaderHTML);

end.