whtmlscn.pas 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. {
  2. $Id$
  3. This file is part of the Free Pascal Integrated Development Environment
  4. Copyright (c) 2000 by Berczi Gabor
  5. HTML scanner objects
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. unit WHTMLScn;
  13. interface
  14. uses Objects,
  15. WHTML;
  16. type
  17. TCustomHTMLLinkScanner = object(THTMLParser)
  18. function DocAddTextChar(C: char): boolean; virtual;
  19. procedure DocAnchor(Entered: boolean); virtual;
  20. public
  21. {a}function CheckURL(const URL: string): boolean; virtual;
  22. {a}function CheckText(const Text: string): boolean; virtual;
  23. {a}procedure AddLink(const LinkText, LinkURL: string); virtual;
  24. {a}function GetDocumentBaseURL: string; virtual;
  25. private
  26. CurLinkText: string;
  27. CurURL: string;
  28. InAnchor: boolean;
  29. end;
  30. PHTMLLinkScanDocument = ^THTMLLinkScanDocument;
  31. THTMLLinkScanDocument = object(TObject)
  32. constructor Init(const ADocName: string);
  33. function GetName: string;
  34. function GetAliasCount: sw_integer;
  35. function GetAlias(Index: sw_integer): string;
  36. procedure AddAlias(const Alias: string);
  37. constructor Load(var S: TStream);
  38. procedure Store(var S: TStream);
  39. destructor Done; virtual;
  40. private
  41. DocName: PString;
  42. Aliases: PStringCollection;
  43. end;
  44. PHTMLLinkScanDocumentCollection = ^THTMLLinkScanDocumentCollection;
  45. THTMLLinkScanDocumentCollection = object(TSortedCollection)
  46. function Compare(Key1, Key2: Pointer): sw_Integer; virtual;
  47. function At(Index: sw_Integer): PHTMLLinkScanDocument;
  48. function SearchDocument(const DocName: string): PHTMLLinkScanDocument;
  49. end;
  50. THTMLLinkScanner = object(TCustomHTMLLinkScanner)
  51. constructor Init;
  52. function GetDocumentCount: sw_integer;
  53. function GetDocumentURL(DocIndex: sw_integer): string;
  54. function GetDocumentAliasCount(DocIndex: sw_integer): sw_integer;
  55. function GetDocumentAlias(DocIndex, AliasIndex: sw_integer): string;
  56. procedure StoreDocuments(var S: TStream);
  57. destructor Done; virtual;
  58. public
  59. procedure AddLink(const LinkText, LinkURL: string); virtual;
  60. private
  61. Documents: PHTMLLinkScanDocumentCollection;
  62. end;
  63. THTMLLinkScanState = (ssScheduled,ssProcessing,ssScanned);
  64. PHTMLLinkScanFile = ^THTMLLinkScanFile;
  65. THTMLLinkScanFile = object(TObject)
  66. constructor Init(const ADocumentURL: string);
  67. function GetDocumentURL: string;
  68. destructor Done; virtual;
  69. private
  70. DocumentURL : PString;
  71. public
  72. State : THTMLLinkScanState;
  73. end;
  74. PHTMLLinkScanFileCollection = ^THTMLLinkScanFileCollection;
  75. THTMLLinkScanFileCollection = object(TSortedCollection)
  76. function At(Index: sw_Integer): PHTMLLinkScanFile;
  77. function Compare(Key1, Key2: Pointer): sw_Integer; virtual;
  78. function SearchFile(const DocURL: string): PHTMLLinkScanFile;
  79. function FindFileWithState(AState: THTMLLinkScanState): PHTMLLinkScanFile;
  80. end;
  81. THTMLLinkScanOption = (soSubDocsOnly);
  82. THTMLLinkScanOptions = set of THTMLLinkScanOption;
  83. THTMLFileLinkScanner = object(THTMLLinkScanner)
  84. constructor Init;
  85. procedure ProcessDocument(const DocumentURL: string; AOptions: THTMLLinkScanOptions);
  86. destructor Done; virtual;
  87. public
  88. function GetDocumentBaseURL: string; virtual;
  89. procedure AddLink(const LinkText, LinkURL: string); virtual;
  90. function CheckURL(const URL: string): boolean; virtual;
  91. private
  92. Options: THTMLLinkScanOptions;
  93. BaseURL: string;
  94. CurBaseURL: string;
  95. DocumentFiles: PHTMLLinkScanFileCollection;
  96. procedure ScheduleDoc(const DocumentURL: string);
  97. public
  98. procedure ProcessDoc(Doc: PHTMLLinkScanFile); virtual;
  99. end;
  100. procedure RegisterWHTMLScan;
  101. implementation
  102. uses WUtils;
  103. const
  104. RHTMLLinkScanDocument: TStreamRec = (
  105. ObjType: 19500;
  106. VmtLink: Ofs(TypeOf(THTMLLinkScanDocument)^);
  107. Load: @THTMLLinkScanDocument.Load;
  108. Store: @THTMLLinkScanDocument.Store
  109. );
  110. function TCustomHTMLLinkScanner.DocAddTextChar(C: char): boolean;
  111. var Added: boolean;
  112. begin
  113. Added:=false;
  114. if InAnchor then
  115. begin
  116. CurLinkText:=CurLinkText+C;
  117. Added:=true;
  118. end;
  119. DocAddTextChar:=Added;
  120. end;
  121. procedure TCustomHTMLLinkScanner.DocAnchor(Entered: boolean);
  122. begin
  123. if Entered then
  124. begin
  125. CurLinkText:='';
  126. if DocGetTagParam('HREF',CurURL)=false then CurURL:='';
  127. CurURL:=Trim(CurURL);
  128. CurURL:=CompleteURL(GetDocumentBaseURL,CurURL);
  129. end
  130. else
  131. begin
  132. CurLinkText:=Trim(CurLinkText);
  133. if CheckURL(CurURL) and CheckText(CurLinkText) then
  134. AddLink(CurLinkText,CurURL);
  135. end;
  136. InAnchor:=Entered;
  137. end;
  138. function TCustomHTMLLinkScanner.GetDocumentBaseURL: string;
  139. begin
  140. { Abstract }
  141. GetDocumentBaseURL:='';
  142. end;
  143. function TCustomHTMLLinkScanner.CheckURL(const URL: string): boolean;
  144. begin
  145. { Abstract }
  146. CheckURL:=true;
  147. end;
  148. function TCustomHTMLLinkScanner.CheckText(const Text: string): boolean;
  149. begin
  150. { Abstract }
  151. CheckText:=true;
  152. end;
  153. procedure TCustomHTMLLinkScanner.AddLink(const LinkText, LinkURL: string);
  154. begin
  155. { Abstract }
  156. end;
  157. constructor THTMLLinkScanDocument.Init(const ADocName: string);
  158. begin
  159. inherited Init;
  160. SetStr(DocName,ADocName);
  161. New(Aliases, Init(10,10));
  162. end;
  163. function THTMLLinkScanDocument.GetName: string;
  164. begin
  165. GetName:=GetStr(DocName);
  166. end;
  167. function THTMLLinkScanDocument.GetAliasCount: sw_integer;
  168. begin
  169. GetAliasCount:=Aliases^.Count;
  170. end;
  171. function THTMLLinkScanDocument.GetAlias(Index: sw_integer): string;
  172. begin
  173. GetAlias:=GetStr(Aliases^.At(Index));
  174. end;
  175. procedure THTMLLinkScanDocument.AddAlias(const Alias: string);
  176. begin
  177. Aliases^.Insert(NewStr(Alias));
  178. end;
  179. constructor THTMLLinkScanDocument.Load(var S: TStream);
  180. begin
  181. inherited Init;
  182. DocName:=S.ReadStr;
  183. New(Aliases, Load(S));
  184. end;
  185. procedure THTMLLinkScanDocument.Store(var S: TStream);
  186. begin
  187. S.WriteStr(DocName);
  188. Aliases^.Store(S);
  189. end;
  190. destructor THTMLLinkScanDocument.Done;
  191. begin
  192. inherited Done;
  193. if Assigned(Aliases) then Dispose(Aliases, Done); Aliases:=nil;
  194. if Assigned(DocName) then DisposeStr(DocName); DocName:=nil;
  195. end;
  196. function THTMLLinkScanDocumentCollection.Compare(Key1, Key2: Pointer): sw_Integer;
  197. var R: sw_integer;
  198. K1: PHTMLLinkScanDocument absolute Key1;
  199. K2: PHTMLLinkScanDocument absolute Key2;
  200. S1,S2: string;
  201. begin
  202. S1:=UpcaseStr(K1^.GetName); S2:=UpcaseStr(K2^.GetName);
  203. if S1<S2 then R:=-1 else
  204. if S1>S2 then R:= 1 else
  205. R:=0;
  206. Compare:=R;
  207. end;
  208. function THTMLLinkScanDocumentCollection.At(Index: sw_Integer): PHTMLLinkScanDocument;
  209. begin
  210. At:=inherited At(Index);
  211. end;
  212. function THTMLLinkScanDocumentCollection.SearchDocument(const DocName: string): PHTMLLinkScanDocument;
  213. var D,P: PHTMLLinkScanDocument;
  214. Index: sw_integer;
  215. begin
  216. New(D, Init(DocName));
  217. if Search(D, Index)=false then P:=nil else
  218. P:=At(Index);
  219. Dispose(D, Done);
  220. SearchDocument:=P;
  221. end;
  222. constructor THTMLLinkScanner.Init;
  223. begin
  224. inherited Init;
  225. New(Documents, Init(50,100));
  226. end;
  227. function THTMLLinkScanner.GetDocumentCount: sw_integer;
  228. begin
  229. GetDocumentCount:=Documents^.Count;
  230. end;
  231. function THTMLLinkScanner.GetDocumentURL(DocIndex: sw_integer): string;
  232. begin
  233. GetDocumentURL:=Documents^.At(DocIndex)^.GetName;
  234. end;
  235. function THTMLLinkScanner.GetDocumentAliasCount(DocIndex: sw_integer): sw_integer;
  236. begin
  237. GetDocumentAliasCount:=Documents^.At(DocIndex)^.GetAliasCount;
  238. end;
  239. function THTMLLinkScanner.GetDocumentAlias(DocIndex, AliasIndex: sw_integer): string;
  240. begin
  241. GetDocumentAlias:=Documents^.At(DocIndex)^.GetAlias(AliasIndex);
  242. end;
  243. procedure THTMLLinkScanner.AddLink(const LinkText, LinkURL: string);
  244. var D: PHTMLLinkScanDocument;
  245. begin
  246. D:=Documents^.SearchDocument(LinkURL);
  247. if D=nil then
  248. begin
  249. New(D, Init(LinkURL));
  250. Documents^.Insert(D);
  251. end;
  252. D^.AddAlias(LinkText);
  253. end;
  254. procedure THTMLLinkScanner.StoreDocuments(var S: TStream);
  255. begin
  256. Documents^.Store(S);
  257. end;
  258. destructor THTMLLinkScanner.Done;
  259. begin
  260. inherited Done;
  261. if Assigned(Documents) then Dispose(Documents, Done); Documents:=nil;
  262. end;
  263. constructor THTMLLinkScanFile.Init(const ADocumentURL: string);
  264. begin
  265. inherited Init;
  266. SetStr(DocumentURL,ADocumentURL);
  267. end;
  268. function THTMLLinkScanFile.GetDocumentURL: string;
  269. begin
  270. GetDocumentURL:=GetStr(DocumentURL);
  271. end;
  272. destructor THTMLLinkScanFile.Done;
  273. begin
  274. inherited Done;
  275. if Assigned(DocumentURL) then DisposeStr(DocumentURL); DocumentURL:=nil;
  276. end;
  277. function THTMLLinkScanFileCollection.At(Index: sw_Integer): PHTMLLinkScanFile;
  278. begin
  279. At:=inherited At(Index);
  280. end;
  281. function THTMLLinkScanFileCollection.Compare(Key1, Key2: Pointer): sw_Integer;
  282. var R: integer;
  283. K1: PHTMLLinkScanFile absolute Key1;
  284. K2: PHTMLLinkScanFile absolute Key2;
  285. S1,S2: string;
  286. begin
  287. S1:=UpcaseStr(K1^.GetDocumentURL); S2:=UpcaseStr(K2^.GetDocumentURL);
  288. if S1<S2 then R:=-1 else
  289. if S1>S2 then R:= 1 else
  290. R:=0;
  291. Compare:=R;
  292. end;
  293. function THTMLLinkScanFileCollection.SearchFile(const DocURL: string): PHTMLLinkScanFile;
  294. var P,D: PHTMLLinkScanFile;
  295. Index: sw_integer;
  296. begin
  297. New(D, Init(DocURL));
  298. if Search(D,Index)=false then P:=nil else
  299. P:=At(Index);
  300. Dispose(D, Done);
  301. SearchFile:=P;
  302. end;
  303. function THTMLLinkScanFileCollection.FindFileWithState(AState: THTMLLinkScanState): PHTMLLinkScanFile;
  304. var I: sw_integer;
  305. P,D: PHTMLLinkScanFile;
  306. begin
  307. P:=nil;
  308. for I:=0 to Count-1 do
  309. begin
  310. D:=At(I);
  311. if D^.State=AState then
  312. begin
  313. P:=D;
  314. Break;
  315. end;
  316. end;
  317. FindFileWithState:=P;
  318. end;
  319. constructor THTMLFileLinkScanner.Init;
  320. begin
  321. inherited Init;
  322. New(DocumentFiles, Init(50,100));
  323. end;
  324. procedure THTMLFileLinkScanner.ProcessDocument(const DocumentURL: string; AOptions: THTMLLinkScanOptions);
  325. var P: PHTMLLinkScanFile;
  326. begin
  327. CurBaseURL:=''; Options:=AOptions;
  328. ScheduleDoc(DocumentURL);
  329. repeat
  330. P:=DocumentFiles^.FindFileWithState(ssScheduled);
  331. if Assigned(P) then
  332. ProcessDoc(P);
  333. until P=nil;
  334. end;
  335. function THTMLFileLinkScanner.GetDocumentBaseURL: string;
  336. begin
  337. GetDocumentBaseURL:=CurBaseURL;
  338. end;
  339. function THTMLFileLinkScanner.CheckURL(const URL: string): boolean;
  340. var OK: boolean;
  341. begin
  342. if soSubDocsOnly in Options then
  343. OK:=UpcaseStr(copy(URL,1,length(BaseURL)))=UpcaseStr(BaseURL)
  344. else
  345. OK:=true;
  346. CheckURL:=OK;
  347. end;
  348. procedure THTMLFileLinkScanner.AddLink(const LinkText, LinkURL: string);
  349. var D: PHTMLLinkScanFile;
  350. P: sw_integer;
  351. DocURL: string;
  352. begin
  353. P:=Pos('#',LinkURL);
  354. if P=0 then DocURL:=LinkURL else DocURL:=copy(LinkURL,1,P-1);
  355. D:=DocumentFiles^.SearchFile(DocURL);
  356. if Assigned(D)=false then
  357. ScheduleDoc(DocURL);
  358. inherited AddLink(LinkText,LinkURL);
  359. end;
  360. procedure THTMLFileLinkScanner.ProcessDoc(Doc: PHTMLLinkScanFile);
  361. var F: PDOSTextFile;
  362. begin
  363. if Assigned(Doc)=false then Exit;
  364. Doc^.State:=ssProcessing;
  365. New(F, Init(Doc^.GetDocumentURL));
  366. if Assigned(F) then
  367. begin
  368. CurBaseURL:=CompleteURL(Doc^.GetDocumentURL,'');
  369. Process(F);
  370. Dispose(F, Done);
  371. end;
  372. Doc^.State:=ssScanned;
  373. end;
  374. procedure THTMLFileLinkScanner.ScheduleDoc(const DocumentURL: string);
  375. var D: PHTMLLinkScanFile;
  376. begin
  377. New(D, Init(DocumentURL));
  378. D^.State:=ssScheduled;
  379. DocumentFiles^.Insert(D);
  380. end;
  381. destructor THTMLFileLinkScanner.Done;
  382. begin
  383. inherited Done;
  384. if Assigned(DocumentFiles) then Dispose(DocumentFiles, Done); DocumentFiles:=nil;
  385. end;
  386. procedure RegisterWHTMLScan;
  387. begin
  388. RegisterType(RHTMLLinkScanDocument);
  389. end;
  390. END.
  391. {
  392. $Log$
  393. Revision 1.1 2000-07-13 09:48:37 michael
  394. + Initial import
  395. Revision 1.7 2000/06/22 09:07:15 pierre
  396. * Gabor changes: see fixes.txt
  397. Revision 1.6 2000/05/29 11:09:14 pierre
  398. + New bunch of Gabor's changes: see fixes.txt
  399. Revision 1.5 2000/05/17 08:49:16 pierre
  400. readded
  401. Revision 1.1 2000/04/25 08:42:32 pierre
  402. * New Gabor changes : see fixes.txt
  403. }