ireaderhtml.pp 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. {
  2. This file is part of the Free Component Library (FCL)
  3. Copyright (c) 2012 by the Free Pascal development team
  4. HTML text reader
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. unit IReaderHTML;
  12. {$mode objfpc}{$H+}
  13. interface
  14. uses
  15. FastHTMLParser, //, HTMLUtil, // Fast Parser Functions
  16. Classes, fpIndexer;
  17. type
  18. { TIReaderHTML }
  19. TIReaderHTML = class(TCustomFileReader)
  20. private
  21. sLine: UTF8String;
  22. StartPos: integer;
  23. Offset: integer;
  24. LinePos: integer;
  25. Tg, Tx: integer;
  26. FParser: THTMLParser; //our htmlparser class
  27. procedure OnTag(NoCaseTag, ActualTag: String);
  28. procedure OnText(Text: String);
  29. protected
  30. function GetToken: UTF8String; override;
  31. function AllowedToken(token: UTF8String): boolean; override;
  32. public
  33. procedure LoadFromStream(FileStream: TStream); override;
  34. end;
  35. implementation
  36. { TIReaderHTML }
  37. procedure TIReaderHTML.OnTag(NoCaseTag, ActualTag: String);
  38. begin
  39. end;
  40. procedure TIReaderHTML.OnText(Text: String);
  41. var
  42. token: UTF8String;
  43. s: TSearchWordData;
  44. i : Integer;
  45. begin
  46. sLine := Text;
  47. LinePos := 1;
  48. Offset:=FParser.CurrentPos;
  49. token := GetToken;
  50. while token <> '' do
  51. begin
  52. if AllowedToken(token) then
  53. begin
  54. s.SearchWord := token;
  55. s.Position := Offset+StartPos;
  56. // Copy area around text.
  57. I:=StartPos-(MaxContextLen div 2);
  58. If I<1 then
  59. I:=1;
  60. s.Context := Copy(SLine,I,I+MaxContextLen);
  61. Add(s);
  62. end;
  63. token := GetToken;
  64. end;
  65. end;
  66. function TIReaderHTML.GetToken: UTF8String;
  67. var
  68. s: UTF8String;
  69. c: UTF8String;
  70. begin
  71. Result := '';
  72. if (sLine = '') or (LinePos >= Length(sLine)) then
  73. exit;
  74. c := sLine[LinePos];
  75. Inc(LinePos);
  76. if LinePos <= Length(sLine) then
  77. begin
  78. //eat all invalid characters
  79. while not (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) and (LinePos <= Length(sLine)) do
  80. begin
  81. c := sLine[LinePos];
  82. Inc(LinePos);
  83. end;
  84. if not (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) then
  85. s := ''
  86. else
  87. s := c;
  88. StartPos:=LinePos;
  89. if LinePos <= Length(sLine) then
  90. begin
  91. //now read all valid characters from stream and append
  92. c := sLine[LinePos];
  93. Inc(LinePos);
  94. while (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) and (LinePos <= Length(sLine)) do
  95. begin
  96. s := S + c;
  97. c := sLine[LinePos];
  98. Inc(LinePos);
  99. end;
  100. end;
  101. if not (c[1] in ['a'..'z', 'A'..'Z', '0'..'9']) then
  102. Result := LowerCase(s)
  103. else
  104. Result := LowerCase(s + c);
  105. end;
  106. end;
  107. function TIReaderHTML.AllowedToken(token: UTF8String): boolean;
  108. begin
  109. Result := (Length(token) > 1) and
  110. (token <> 'nbsp') and (token <> 'quot') and (token <> 'apos') and
  111. (token <> 'amp') and (token <> 'lt') and (token <> 'gt');
  112. end;
  113. procedure TIReaderHTML.LoadFromStream(FileStream: TStream);
  114. var
  115. S : TStringStream;
  116. begin
  117. inherited LoadFromStream(FileStream);
  118. S:=TStringStream.Create('');
  119. try
  120. S.CopyFrom(FileStream,0);
  121. Tg := 0;
  122. Tx := 0;
  123. FParser := THTMLParser.Create(S.DataString);
  124. try
  125. FParser.OnFoundTag := @OnTag;
  126. FParser.OnFoundText := @OnText;
  127. FParser.Exec;
  128. finally
  129. FParser.Free;
  130. end;
  131. finally
  132. S.Free;
  133. end;
  134. if DetectLanguage then
  135. DoDetectLanguage;
  136. end;
  137. initialization
  138. FileHandlers.RegisterFileReader('HTML format', 'html;htm', TIReaderHTML);
  139. end.