whtmlscn.pas 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. {
  2. $Id$
  3. This file is part of the Free Pascal Integrated Development Environment
  4. Copyright (c) 2000 by Berczi Gabor
  5. HTML scanner objects
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. unit WHTMLScn;
  13. interface
  14. uses Objects,
  15. WHTML;
  16. const
  17. HTMLIndexMagicNo = ord('H')+ord('H') shl 8+ord('I') shl 16+ord('X') shl 24;
  18. HTMLIndexVersion = 1;
  19. type
  20. PHTMLLinkScanner = ^THTMLLinkScanner;
  21. TCustomHTMLLinkScanner = object(THTMLParser)
  22. function DocAddTextChar(C: char): boolean; virtual;
  23. procedure DocAnchor(Entered: boolean); virtual;
  24. public
  25. {a}function CheckURL(const URL: string): boolean; virtual;
  26. {a}function CheckText(const Text: string): boolean; virtual;
  27. {a}procedure AddLink(const LinkText, LinkURL: string); virtual;
  28. {a}function GetDocumentBaseURL: string; virtual;
  29. private
  30. CurLinkText: string;
  31. CurURL: string;
  32. InAnchor: boolean;
  33. end;
  34. PHTMLLinkScanDocument = ^THTMLLinkScanDocument;
  35. THTMLLinkScanDocument = object(TObject)
  36. constructor Init(const ADocName: string);
  37. function GetName: string;
  38. function GetAliasCount: sw_integer;
  39. function GetAlias(Index: sw_integer): string;
  40. procedure AddAlias(const Alias: string);
  41. constructor Load(var S: TStream);
  42. procedure Store(var S: TStream);
  43. destructor Done; virtual;
  44. private
  45. DocName: PString;
  46. Aliases: PStringCollection;
  47. end;
  48. PHTMLLinkScanDocumentCollection = ^THTMLLinkScanDocumentCollection;
  49. THTMLLinkScanDocumentCollection = object(TSortedCollection)
  50. constructor Init(AScanner: PHTMLLinkScanner; ALimit, ADelta: Integer);
  51. function Compare(Key1, Key2: Pointer): sw_Integer; virtual;
  52. function At(Index: sw_Integer): PHTMLLinkScanDocument;
  53. function SearchDocument(const DocName: string): PHTMLLinkScanDocument;
  54. private
  55. Scanner: PHTMLLinkScanner;
  56. end;
  57. THTMLLinkScanner = object(TCustomHTMLLinkScanner)
  58. constructor Init(const ABaseDir: string);
  59. procedure SetBaseDir(const ABaseDir: string);
  60. function GetDocumentCount: sw_integer;
  61. function GetDocumentURL(DocIndex: sw_integer): string;
  62. function GetDocumentAliasCount(DocIndex: sw_integer): sw_integer;
  63. function GetDocumentAlias(DocIndex, AliasIndex: sw_integer): string;
  64. constructor LoadDocuments(var S: TStream);
  65. procedure StoreDocuments(var S: TStream);
  66. destructor Done; virtual;
  67. public
  68. procedure AddLink(const LinkText, LinkURL: string); virtual;
  69. private
  70. Documents: PHTMLLinkScanDocumentCollection;
  71. BaseDir: PString;
  72. function ExpandChildURL(const S: string): string;
  73. function NormalizeChildURL(const S: string): string;
  74. end;
  75. THTMLLinkScanState = (ssScheduled,ssProcessing,ssScanned);
  76. PHTMLLinkScanFile = ^THTMLLinkScanFile;
  77. THTMLLinkScanFile = object(TObject)
  78. constructor Init(const ADocumentURL: string);
  79. function GetDocumentURL: string;
  80. destructor Done; virtual;
  81. private
  82. DocumentURL : PString;
  83. public
  84. State : THTMLLinkScanState;
  85. end;
  86. PHTMLLinkScanFileCollection = ^THTMLLinkScanFileCollection;
  87. THTMLLinkScanFileCollection = object(TSortedCollection)
  88. function At(Index: sw_Integer): PHTMLLinkScanFile;
  89. function Compare(Key1, Key2: Pointer): sw_Integer; virtual;
  90. function SearchFile(const DocURL: string): PHTMLLinkScanFile;
  91. function FindFileWithState(AState: THTMLLinkScanState): PHTMLLinkScanFile;
  92. end;
  93. THTMLLinkScanOption = (soSubDocsOnly);
  94. THTMLLinkScanOptions = set of THTMLLinkScanOption;
  95. THTMLFileLinkScanner = object(THTMLLinkScanner)
  96. constructor Init(const ABaseDir: string);
  97. procedure ProcessDocument(const DocumentURL: string; AOptions: THTMLLinkScanOptions);
  98. destructor Done; virtual;
  99. public
  100. function GetDocumentBaseURL: string; virtual;
  101. procedure AddLink(const LinkText, LinkURL: string); virtual;
  102. function CheckURL(const URL: string): boolean; virtual;
  103. private
  104. Options: THTMLLinkScanOptions;
  105. BaseURL: string;
  106. CurBaseURL: string;
  107. DocumentFiles: PHTMLLinkScanFileCollection;
  108. procedure ScheduleDoc(const DocumentURL: string);
  109. public
  110. procedure ProcessDoc(Doc: PHTMLLinkScanFile); virtual;
  111. end;
  112. procedure RegisterWHTMLScan;
  113. implementation
  114. uses WUtils;
  115. const
  116. RHTMLLinkScanDocument: TStreamRec = (
  117. ObjType: 19500;
  118. VmtLink: Ofs(TypeOf(THTMLLinkScanDocument)^);
  119. Load: @THTMLLinkScanDocument.Load;
  120. Store: @THTMLLinkScanDocument.Store
  121. );
  122. function TCustomHTMLLinkScanner.DocAddTextChar(C: char): boolean;
  123. var Added: boolean;
  124. begin
  125. Added:=false;
  126. if InAnchor then
  127. begin
  128. CurLinkText:=CurLinkText+C;
  129. Added:=true;
  130. end;
  131. DocAddTextChar:=Added;
  132. end;
  133. procedure TCustomHTMLLinkScanner.DocAnchor(Entered: boolean);
  134. begin
  135. if Entered then
  136. begin
  137. CurLinkText:='';
  138. if DocGetTagParam('HREF',CurURL)=false then CurURL:='';
  139. CurURL:=Trim(CurURL);
  140. CurURL:=CompleteURL(GetDocumentBaseURL,CurURL);
  141. end
  142. else
  143. begin
  144. CurLinkText:=Trim(CurLinkText);
  145. if CheckURL(CurURL) and CheckText(CurLinkText) then
  146. AddLink(CurLinkText,CurURL);
  147. end;
  148. InAnchor:=Entered;
  149. end;
  150. function TCustomHTMLLinkScanner.GetDocumentBaseURL: string;
  151. begin
  152. { Abstract }
  153. GetDocumentBaseURL:='';
  154. end;
  155. function TCustomHTMLLinkScanner.CheckURL(const URL: string): boolean;
  156. begin
  157. { Abstract }
  158. CheckURL:=true;
  159. end;
  160. function TCustomHTMLLinkScanner.CheckText(const Text: string): boolean;
  161. begin
  162. { Abstract }
  163. CheckText:=true;
  164. end;
  165. procedure TCustomHTMLLinkScanner.AddLink(const LinkText, LinkURL: string);
  166. begin
  167. { Abstract }
  168. end;
  169. constructor THTMLLinkScanDocument.Init(const ADocName: string);
  170. begin
  171. inherited Init;
  172. SetStr(DocName,ADocName);
  173. New(Aliases, Init(10,10));
  174. end;
  175. function THTMLLinkScanDocument.GetName: string;
  176. begin
  177. GetName:=GetStr(DocName);
  178. end;
  179. function THTMLLinkScanDocument.GetAliasCount: sw_integer;
  180. begin
  181. GetAliasCount:=Aliases^.Count;
  182. end;
  183. function THTMLLinkScanDocument.GetAlias(Index: sw_integer): string;
  184. begin
  185. GetAlias:=GetStr(Aliases^.At(Index));
  186. end;
  187. procedure THTMLLinkScanDocument.AddAlias(const Alias: string);
  188. begin
  189. Aliases^.Insert(NewStr(Alias));
  190. end;
  191. constructor THTMLLinkScanDocument.Load(var S: TStream);
  192. begin
  193. inherited Init;
  194. DocName:=S.ReadStr;
  195. New(Aliases, Load(S));
  196. end;
  197. procedure THTMLLinkScanDocument.Store(var S: TStream);
  198. begin
  199. S.WriteStr(DocName);
  200. Aliases^.Store(S);
  201. end;
  202. destructor THTMLLinkScanDocument.Done;
  203. begin
  204. inherited Done;
  205. if Assigned(Aliases) then Dispose(Aliases, Done); Aliases:=nil;
  206. if Assigned(DocName) then DisposeStr(DocName); DocName:=nil;
  207. end;
  208. constructor THTMLLinkScanDocumentCollection.Init(AScanner: PHTMLLinkScanner; ALimit, ADelta: Integer);
  209. begin
  210. inherited Init(ALimit,ADelta);
  211. Scanner:=AScanner;
  212. end;
  213. function THTMLLinkScanDocumentCollection.Compare(Key1, Key2: Pointer): sw_Integer;
  214. var R: sw_integer;
  215. K1: PHTMLLinkScanDocument absolute Key1;
  216. K2: PHTMLLinkScanDocument absolute Key2;
  217. S1,S2: string;
  218. begin
  219. S1:=K1^.GetName; S2:=K2^.GetName;
  220. if Assigned(Scanner) then
  221. begin S1:=Scanner^.ExpandChildURL(S1); S2:=Scanner^.ExpandChildURL(S2); end;
  222. S1:=UpcaseStr(S1); S2:=UpcaseStr(S2);
  223. if S1<S2 then R:=-1 else
  224. if S1>S2 then R:= 1 else
  225. R:=0;
  226. Compare:=R;
  227. end;
  228. function THTMLLinkScanDocumentCollection.At(Index: sw_Integer): PHTMLLinkScanDocument;
  229. begin
  230. At:=inherited At(Index);
  231. end;
  232. function THTMLLinkScanDocumentCollection.SearchDocument(const DocName: string): PHTMLLinkScanDocument;
  233. var D,P: PHTMLLinkScanDocument;
  234. Index: sw_integer;
  235. begin
  236. New(D, Init(DocName));
  237. if Search(D, Index)=false then P:=nil else
  238. P:=At(Index);
  239. Dispose(D, Done);
  240. SearchDocument:=P;
  241. end;
  242. constructor THTMLLinkScanner.Init(const ABaseDir: string);
  243. begin
  244. inherited Init;
  245. New(Documents, Init(@Self,50,100));
  246. SetBaseDir(ABaseDir);
  247. end;
  248. procedure THTMLLinkScanner.SetBaseDir(const ABaseDir: string);
  249. begin
  250. if Assigned(BaseDir) then DisposeStr(BaseDir);
  251. BaseDir:=NewStr(CompleteDir(ABaseDir));
  252. end;
  253. function THTMLLinkScanner.GetDocumentCount: sw_integer;
  254. begin
  255. GetDocumentCount:=Documents^.Count;
  256. end;
  257. function THTMLLinkScanner.ExpandChildURL(const S: string): string;
  258. begin
  259. ExpandChildURL:=CompleteURL(GetStr(BaseDir),S);
  260. end;
  261. function THTMLLinkScanner.NormalizeChildURL(const S: string): string;
  262. var URL: string;
  263. begin
  264. URL:=S;
  265. if GetStr(BaseDir)<>'' then
  266. if copy(UpcaseStr(S),1,length(GetStr(BaseDir)))=UpcaseStr(GetStr(BaseDir)) then
  267. URL:=copy(S,length(GetStr(BaseDir))+1,length(S));
  268. NormalizeChildURL:=URL;
  269. end;
  270. function THTMLLinkScanner.GetDocumentURL(DocIndex: sw_integer): string;
  271. begin
  272. GetDocumentURL:=ExpandChildURL(Documents^.At(DocIndex)^.GetName);
  273. end;
  274. function THTMLLinkScanner.GetDocumentAliasCount(DocIndex: sw_integer): sw_integer;
  275. begin
  276. GetDocumentAliasCount:=Documents^.At(DocIndex)^.GetAliasCount;
  277. end;
  278. function THTMLLinkScanner.GetDocumentAlias(DocIndex, AliasIndex: sw_integer): string;
  279. begin
  280. GetDocumentAlias:=Documents^.At(DocIndex)^.GetAlias(AliasIndex);
  281. end;
  282. procedure THTMLLinkScanner.AddLink(const LinkText, LinkURL: string);
  283. var D: PHTMLLinkScanDocument;
  284. begin
  285. D:=Documents^.SearchDocument(LinkURL);
  286. if D=nil then
  287. begin
  288. New(D, Init(NormalizeChildURL(LinkURL)));
  289. Documents^.Insert(D);
  290. end;
  291. D^.AddAlias(LinkText);
  292. end;
  293. constructor THTMLLinkScanner.LoadDocuments(var S: TStream);
  294. var P,L: longint;
  295. OK: boolean;
  296. PS: PString;
  297. begin
  298. OK:=false;
  299. P:=S.GetPos;
  300. S.Read(L,sizeof(L));
  301. if (S.Status=stOK) and (L=HTMLIndexMagicNo) then
  302. begin
  303. S.Read(L,sizeof(L));
  304. OK:=(S.Status=stOK);
  305. end;
  306. if not OK then
  307. begin
  308. S.Reset;
  309. S.Seek(P);
  310. end
  311. else
  312. BaseDir:=S.ReadStr;
  313. New(Documents, Load(S));
  314. if not Assigned(Documents) then
  315. Fail;
  316. end;
  317. procedure THTMLLinkScanner.StoreDocuments(var S: TStream);
  318. var L: longint;
  319. begin
  320. L:=HTMLIndexMagicNo;
  321. S.Write(L,sizeof(L));
  322. L:=HTMLIndexVersion;
  323. S.Write(L,sizeof(L));
  324. S.WriteStr(BaseDir);
  325. Documents^.Store(S);
  326. end;
  327. destructor THTMLLinkScanner.Done;
  328. begin
  329. inherited Done;
  330. if Assigned(Documents) then Dispose(Documents, Done); Documents:=nil;
  331. if Assigned(BaseDir) then DisposeStr(BaseDir); BaseDir:=nil;
  332. end;
  333. constructor THTMLLinkScanFile.Init(const ADocumentURL: string);
  334. begin
  335. inherited Init;
  336. SetStr(DocumentURL,ADocumentURL);
  337. end;
  338. function THTMLLinkScanFile.GetDocumentURL: string;
  339. begin
  340. GetDocumentURL:=GetStr(DocumentURL);
  341. end;
  342. destructor THTMLLinkScanFile.Done;
  343. begin
  344. inherited Done;
  345. if Assigned(DocumentURL) then DisposeStr(DocumentURL); DocumentURL:=nil;
  346. end;
  347. function THTMLLinkScanFileCollection.At(Index: sw_Integer): PHTMLLinkScanFile;
  348. begin
  349. At:=inherited At(Index);
  350. end;
  351. function THTMLLinkScanFileCollection.Compare(Key1, Key2: Pointer): sw_Integer;
  352. var R: integer;
  353. K1: PHTMLLinkScanFile absolute Key1;
  354. K2: PHTMLLinkScanFile absolute Key2;
  355. S1,S2: string;
  356. begin
  357. S1:=UpcaseStr(K1^.GetDocumentURL); S2:=UpcaseStr(K2^.GetDocumentURL);
  358. if S1<S2 then R:=-1 else
  359. if S1>S2 then R:= 1 else
  360. R:=0;
  361. Compare:=R;
  362. end;
  363. function THTMLLinkScanFileCollection.SearchFile(const DocURL: string): PHTMLLinkScanFile;
  364. var P,D: PHTMLLinkScanFile;
  365. Index: sw_integer;
  366. begin
  367. New(D, Init(DocURL));
  368. if Search(D,Index)=false then P:=nil else
  369. P:=At(Index);
  370. Dispose(D, Done);
  371. SearchFile:=P;
  372. end;
  373. function THTMLLinkScanFileCollection.FindFileWithState(AState: THTMLLinkScanState): PHTMLLinkScanFile;
  374. var I: sw_integer;
  375. P,D: PHTMLLinkScanFile;
  376. begin
  377. P:=nil;
  378. for I:=0 to Count-1 do
  379. begin
  380. D:=At(I);
  381. if D^.State=AState then
  382. begin
  383. P:=D;
  384. Break;
  385. end;
  386. end;
  387. FindFileWithState:=P;
  388. end;
  389. constructor THTMLFileLinkScanner.Init(const ABaseDir: string);
  390. begin
  391. inherited Init(ABaseDir);
  392. New(DocumentFiles, Init(50,100));
  393. end;
  394. procedure THTMLFileLinkScanner.ProcessDocument(const DocumentURL: string; AOptions: THTMLLinkScanOptions);
  395. var P: PHTMLLinkScanFile;
  396. begin
  397. CurBaseURL:=''; Options:=AOptions;
  398. ScheduleDoc(DocumentURL);
  399. repeat
  400. P:=DocumentFiles^.FindFileWithState(ssScheduled);
  401. if Assigned(P) then
  402. ProcessDoc(P);
  403. until P=nil;
  404. end;
  405. function THTMLFileLinkScanner.GetDocumentBaseURL: string;
  406. begin
  407. GetDocumentBaseURL:=CurBaseURL;
  408. end;
  409. function THTMLFileLinkScanner.CheckURL(const URL: string): boolean;
  410. var OK: boolean;
  411. begin
  412. if soSubDocsOnly in Options then
  413. OK:=UpcaseStr(copy(URL,1,length(BaseURL)))=UpcaseStr(BaseURL)
  414. else
  415. OK:=true;
  416. CheckURL:=OK;
  417. end;
  418. procedure THTMLFileLinkScanner.AddLink(const LinkText, LinkURL: string);
  419. var D: PHTMLLinkScanFile;
  420. P: sw_integer;
  421. DocURL: string;
  422. begin
  423. P:=Pos('#',LinkURL);
  424. if P=0 then DocURL:=LinkURL else DocURL:=copy(LinkURL,1,P-1);
  425. D:=DocumentFiles^.SearchFile(DocURL);
  426. if Assigned(D)=false then
  427. ScheduleDoc(DocURL);
  428. inherited AddLink(LinkText,LinkURL);
  429. end;
  430. procedure THTMLFileLinkScanner.ProcessDoc(Doc: PHTMLLinkScanFile);
  431. var F: PDOSTextFile;
  432. begin
  433. if Assigned(Doc)=false then Exit;
  434. Doc^.State:=ssProcessing;
  435. New(F, Init(Doc^.GetDocumentURL));
  436. if Assigned(F) then
  437. begin
  438. CurBaseURL:=CompleteURL(Doc^.GetDocumentURL,'');
  439. Process(F);
  440. Dispose(F, Done);
  441. end;
  442. Doc^.State:=ssScanned;
  443. end;
  444. procedure THTMLFileLinkScanner.ScheduleDoc(const DocumentURL: string);
  445. var D: PHTMLLinkScanFile;
  446. begin
  447. New(D, Init(DocumentURL));
  448. D^.State:=ssScheduled;
  449. DocumentFiles^.Insert(D);
  450. end;
  451. destructor THTMLFileLinkScanner.Done;
  452. begin
  453. inherited Done;
  454. if Assigned(DocumentFiles) then Dispose(DocumentFiles, Done); DocumentFiles:=nil;
  455. end;
  456. procedure RegisterWHTMLScan;
  457. begin
  458. RegisterType(RHTMLLinkScanDocument);
  459. end;
  460. END.
  461. {
  462. $Log$
  463. Revision 1.2 2000-10-31 22:35:56 pierre
  464. * New big merge from fixes branch
  465. Revision 1.1.2.1 2000/10/18 21:53:28 pierre
  466. * several Gabor fixes
  467. Revision 1.1 2000/07/13 09:48:37 michael
  468. + Initial import
  469. Revision 1.7 2000/06/22 09:07:15 pierre
  470. * Gabor changes: see fixes.txt
  471. Revision 1.6 2000/05/29 11:09:14 pierre
  472. + New bunch of Gabor's changes: see fixes.txt
  473. Revision 1.5 2000/05/17 08:49:16 pierre
  474. readded
  475. Revision 1.1 2000/04/25 08:42:32 pierre
  476. * New Gabor changes : see fixes.txt
  477. }