whtmlscn.pas 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618
  1. {
  2. This file is part of the Free Pascal Integrated Development Environment
  3. Copyright (c) 2000 by Berczi Gabor
  4. HTML scanner objects
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. unit WHTMLScn;
  12. interface
  13. uses Objects,
  14. WHTML;
  15. const
  16. HTMLIndexMagicNo = ord('H')+ord('H') shl 8+ord('I') shl 16+ord('X') shl 24;
  17. HTMLIndexVersion = 2;
  18. type
  19. PHTMLLinkScanner = ^THTMLLinkScanner;
  20. PHTMLLinkScanDocument = ^THTMLLinkScanDocument;
  21. TCustomHTMLLinkScanner = object(THTMLParser)
  22. function DocAddTextChar(C: char): boolean; virtual;
  23. procedure DocAnchor(Entered: boolean); virtual;
  24. public
  25. {a}function CheckURL(const URL: string): boolean; virtual;
  26. {a}function CheckText(const Text: string): boolean; virtual;
  27. {a}procedure AddLink(const LinkText, LinkURL: string); virtual;
  28. {a}function GetDocumentBaseURL: string; virtual;
  29. private
  30. CurLinkText: string;
  31. CurURL: string;
  32. CurDoc: string;
  33. InAnchor,InNameAnchor: boolean;
  34. LastSynonym: PHTMLLinkScanDocument;
  35. end;
  36. THTMLLinkScanDocument = object(TObject)
  37. constructor Init(const ADocName: string);
  38. function GetName: string;
  39. function GetUniqueName: string;
  40. function GetAliasCount: sw_integer;
  41. function GetAlias(Index: sw_integer): string;
  42. procedure AddAlias(const Alias: string);
  43. constructor Load(var S: TStream);
  44. procedure Store(var S: TStream);
  45. destructor Done; virtual;
  46. private
  47. DocName: PString;
  48. Synonym: PHTMLLinkScanDocument;
  49. Aliases: PStringCollection;
  50. end;
  51. PHTMLLinkScanDocumentCollection = ^THTMLLinkScanDocumentCollection;
  52. THTMLLinkScanDocumentCollection = object(TSortedCollection)
  53. constructor Init(AScanner: PHTMLLinkScanner; ALimit, ADelta: Integer);
  54. function Compare(Key1, Key2: Pointer): sw_Integer; virtual;
  55. function At(Index: sw_Integer): PHTMLLinkScanDocument;
  56. function SearchDocument(const DocName: string): PHTMLLinkScanDocument;
  57. procedure MoveAliasesToSynonym;
  58. private
  59. Scanner: PHTMLLinkScanner;
  60. end;
  61. THTMLLinkScanner = object(TCustomHTMLLinkScanner)
  62. constructor Init(const ABaseDir: string);
  63. procedure SetBaseDir(const ABaseDir: string);
  64. function GetDocumentCount: sw_integer;
  65. function GetDocumentURL(DocIndex: sw_integer): string;
  66. function GetUniqueDocumentURL(DocIndex: sw_integer): string;
  67. function GetDocumentAliasCount(DocIndex: sw_integer): sw_integer;
  68. function GetDocumentAlias(DocIndex, AliasIndex: sw_integer): string;
  69. constructor LoadDocuments(var S: TStream);
  70. procedure StoreDocuments(var S: TStream);
  71. destructor Done; virtual;
  72. public
  73. procedure AddLink(const LinkText, LinkURL: string); virtual;
  74. private
  75. Documents: PHTMLLinkScanDocumentCollection;
  76. BaseDir: PString;
  77. function ExpandChildURL(const S: string): string;
  78. function NormalizeChildURL(const S: string): string;
  79. end;
  80. THTMLLinkScanState = (ssScheduled,ssProcessing,ssScanned);
  81. PHTMLLinkScanFile = ^THTMLLinkScanFile;
  82. THTMLLinkScanFile = object(TObject)
  83. constructor Init(const ADocumentURL: string);
  84. function GetDocumentURL: string;
  85. destructor Done; virtual;
  86. private
  87. DocumentURL : PString;
  88. public
  89. State : THTMLLinkScanState;
  90. end;
  91. PHTMLLinkScanFileCollection = ^THTMLLinkScanFileCollection;
  92. THTMLLinkScanFileCollection = object(TSortedCollection)
  93. function At(Index: sw_Integer): PHTMLLinkScanFile;
  94. function Compare(Key1, Key2: Pointer): sw_Integer; virtual;
  95. function SearchFile(const DocURL: string): PHTMLLinkScanFile;
  96. function FindFileWithState(AState: THTMLLinkScanState): PHTMLLinkScanFile;
  97. end;
  98. THTMLLinkScanOption = (soSubDocsOnly);
  99. THTMLLinkScanOptions = set of THTMLLinkScanOption;
  100. THTMLFileLinkScanner = object(THTMLLinkScanner)
  101. constructor Init(const ABaseDir: string);
  102. procedure ProcessDocument(const DocumentURL: string; AOptions: THTMLLinkScanOptions);
  103. destructor Done; virtual;
  104. public
  105. function GetDocumentBaseURL: string; virtual;
  106. procedure AddLink(const LinkText, LinkURL: string); virtual;
  107. function CheckURL(const URL: string): boolean; virtual;
  108. private
  109. Options: THTMLLinkScanOptions;
  110. BaseURL: string;
  111. CurBaseURL: string;
  112. DocumentFiles: PHTMLLinkScanFileCollection;
  113. procedure ScheduleDoc(const DocumentURL: string);
  114. public
  115. procedure ProcessDoc(Doc: PHTMLLinkScanFile); virtual;
  116. end;
  117. procedure RegisterWHTMLScan;
  118. implementation
  119. uses WUtils;
  120. const
  121. RHTMLLinkScanDocument: TStreamRec = (
  122. ObjType: 19500;
  123. VmtLink: Ofs(TypeOf(THTMLLinkScanDocument)^);
  124. Load: @THTMLLinkScanDocument.Load;
  125. Store: @THTMLLinkScanDocument.Store
  126. );
  127. const
  128. CurrentHTMLIndexVersion : sw_integer = HTMLIndexVersion;
  129. function TCustomHTMLLinkScanner.DocAddTextChar(C: char): boolean;
  130. var Added: boolean;
  131. begin
  132. Added:=false;
  133. if InAnchor then
  134. begin
  135. CurLinkText:=CurLinkText+C;
  136. Added:=true;
  137. end;
  138. if ord(c)>32 then
  139. LastSynonym:=nil;
  140. DocAddTextChar:=Added;
  141. end;
  142. procedure TCustomHTMLLinkScanner.DocAnchor(Entered: boolean);
  143. begin
  144. if Entered then
  145. begin
  146. CurLinkText:='';
  147. if DocGetTagParam('HREF',CurURL)=false then
  148. if DocGetTagParam('NAME',CurURL) then
  149. begin
  150. InNameAnchor:=true;
  151. If Pos('#',CurURL)=0 then
  152. CurURL:=CurDoc+'#'+CurURL;
  153. end
  154. else
  155. CurURL:='';
  156. CurURL:=Trim(CurURL);
  157. CurURL:=CompleteURL(GetDocumentBaseURL,CurURL);
  158. end
  159. else
  160. begin
  161. CurLinkText:=Trim(CurLinkText);
  162. if CheckURL(CurURL) and CheckText(CurLinkText) or InNameAnchor then
  163. AddLink(CurLinkText,CurURL);
  164. InNameAnchor:=false;
  165. end;
  166. InAnchor:=Entered;
  167. end;
  168. function TCustomHTMLLinkScanner.GetDocumentBaseURL: string;
  169. begin
  170. { Abstract }
  171. GetDocumentBaseURL:='';
  172. end;
  173. function TCustomHTMLLinkScanner.CheckURL(const URL: string): boolean;
  174. begin
  175. { Abstract }
  176. CheckURL:=true;
  177. end;
  178. function TCustomHTMLLinkScanner.CheckText(const Text: string): boolean;
  179. begin
  180. { Abstract }
  181. CheckText:=true;
  182. end;
  183. procedure TCustomHTMLLinkScanner.AddLink(const LinkText, LinkURL: string);
  184. begin
  185. { Abstract }
  186. end;
  187. constructor THTMLLinkScanDocument.Init(const ADocName: string);
  188. begin
  189. inherited Init;
  190. SetStr(DocName,ADocName);
  191. New(Aliases, Init(10,10));
  192. Synonym:=nil;
  193. end;
  194. function THTMLLinkScanDocument.GetName: string;
  195. begin
  196. GetName:=GetStr(DocName);
  197. end;
  198. function THTMLLinkScanDocument.GetUniqueName: string;
  199. var
  200. PD: PHTMLLinkScanDocument;
  201. begin
  202. PD:=@Self;
  203. while assigned(PD^.synonym) do
  204. PD:=PD^.Synonym;
  205. GetUniqueName:=GetStr(PD^.DocName);
  206. end;
  207. function THTMLLinkScanDocument.GetAliasCount: sw_integer;
  208. begin
  209. GetAliasCount:=Aliases^.Count;
  210. end;
  211. function THTMLLinkScanDocument.GetAlias(Index: sw_integer): string;
  212. begin
  213. GetAlias:=GetStr(Aliases^.At(Index));
  214. end;
  215. procedure THTMLLinkScanDocument.AddAlias(const Alias: string);
  216. begin
  217. Aliases^.Insert(NewStr(Alias));
  218. end;
  219. constructor THTMLLinkScanDocument.Load(var S: TStream);
  220. var
  221. i: sw_integer;
  222. begin
  223. inherited Init;
  224. DocName:=S.ReadStr;
  225. if assigned(DocName) then
  226. for i:=1 to Length(DocName^) do
  227. if (DocName^[i]='\') or (DocName^[i]='/') then
  228. DocName^[i]:=DirSep;
  229. New(Aliases, Load(S));
  230. end;
  231. procedure THTMLLinkScanDocument.Store(var S: TStream);
  232. begin
  233. S.WriteStr(DocName);
  234. Aliases^.Store(S);
  235. end;
  236. destructor THTMLLinkScanDocument.Done;
  237. begin
  238. inherited Done;
  239. if Assigned(Aliases) then Dispose(Aliases, Done); Aliases:=nil;
  240. if Assigned(DocName) then DisposeStr(DocName); DocName:=nil;
  241. end;
  242. constructor THTMLLinkScanDocumentCollection.Init(AScanner: PHTMLLinkScanner; ALimit, ADelta: Integer);
  243. begin
  244. inherited Init(ALimit,ADelta);
  245. Scanner:=AScanner;
  246. end;
  247. function THTMLLinkScanDocumentCollection.Compare(Key1, Key2: Pointer): sw_Integer;
  248. var R: sw_integer;
  249. K1: PHTMLLinkScanDocument absolute Key1;
  250. K2: PHTMLLinkScanDocument absolute Key2;
  251. S1,S2: string;
  252. begin
  253. S1:=K1^.GetName; S2:=K2^.GetName;
  254. if Assigned(Scanner) then
  255. begin S1:=Scanner^.ExpandChildURL(S1); S2:=Scanner^.ExpandChildURL(S2); end;
  256. S1:=UpcaseStr(S1); S2:=UpcaseStr(S2);
  257. if S1<S2 then R:=-1 else
  258. if S1>S2 then R:= 1 else
  259. R:=0;
  260. Compare:=R;
  261. end;
  262. function THTMLLinkScanDocumentCollection.At(Index: sw_Integer): PHTMLLinkScanDocument;
  263. begin
  264. At:=inherited At(Index);
  265. end;
  266. function THTMLLinkScanDocumentCollection.SearchDocument(const DocName: string): PHTMLLinkScanDocument;
  267. var D,P: PHTMLLinkScanDocument;
  268. Index: sw_integer;
  269. begin
  270. New(D, Init(DocName));
  271. if Search(D, Index)=false then P:=nil else
  272. P:=At(Index);
  273. Dispose(D, Done);
  274. SearchDocument:=P;
  275. end;
  276. procedure THTMLLinkScanDocumentCollection.MoveAliasesToSynonym;
  277. procedure MoveAliases(P: PHTMLLinkScanDocument);
  278. var
  279. PD: PHTMLLinkScanDocument;
  280. i: sw_integer;
  281. begin
  282. if not assigned(P^.synonym) then
  283. exit;
  284. PD:=P;
  285. while assigned(PD^.synonym) do
  286. PD:=PD^.Synonym;
  287. For i:=P^.GetAliasCount-1 downto 0 do
  288. begin
  289. PD^.AddAlias(P^.GetAlias(i));
  290. P^.Aliases^.AtFree(i);
  291. end;
  292. end;
  293. begin
  294. ForEach(@MoveAliases);
  295. end;
  296. constructor THTMLLinkScanner.Init(const ABaseDir: string);
  297. begin
  298. inherited Init;
  299. New(Documents, Init(@Self,50,100));
  300. SetBaseDir(ABaseDir);
  301. end;
  302. procedure THTMLLinkScanner.SetBaseDir(const ABaseDir: string);
  303. begin
  304. if Assigned(BaseDir) then DisposeStr(BaseDir);
  305. BaseDir:=NewStr(CompleteDir(ABaseDir));
  306. end;
  307. function THTMLLinkScanner.GetDocumentCount: sw_integer;
  308. begin
  309. GetDocumentCount:=Documents^.Count;
  310. end;
  311. function THTMLLinkScanner.ExpandChildURL(const S: string): string;
  312. begin
  313. ExpandChildURL:=CompleteURL(GetStr(BaseDir),S);
  314. end;
  315. function THTMLLinkScanner.NormalizeChildURL(const S: string): string;
  316. var URL: string;
  317. begin
  318. URL:=S;
  319. if GetStr(BaseDir)<>'' then
  320. if copy(UpcaseStr(S),1,length(GetStr(BaseDir)))=UpcaseStr(GetStr(BaseDir)) then
  321. URL:=copy(S,length(GetStr(BaseDir))+1,length(S));
  322. NormalizeChildURL:=URL;
  323. end;
  324. function THTMLLinkScanner.GetDocumentURL(DocIndex: sw_integer): string;
  325. begin
  326. GetDocumentURL:=ExpandChildURL(Documents^.At(DocIndex)^.GetName);
  327. end;
  328. function THTMLLinkScanner.GetUniqueDocumentURL(DocIndex: sw_integer): string;
  329. begin
  330. GetUniqueDocumentURL:=ExpandChildURL(Documents^.At(DocIndex)^.GetUniqueName);
  331. end;
  332. function THTMLLinkScanner.GetDocumentAliasCount(DocIndex: sw_integer): sw_integer;
  333. begin
  334. GetDocumentAliasCount:=Documents^.At(DocIndex)^.GetAliasCount;
  335. end;
  336. function THTMLLinkScanner.GetDocumentAlias(DocIndex, AliasIndex: sw_integer): string;
  337. begin
  338. GetDocumentAlias:=Documents^.At(DocIndex)^.GetAlias(AliasIndex);
  339. end;
  340. procedure THTMLLinkScanner.AddLink(const LinkText, LinkURL: string);
  341. var D: PHTMLLinkScanDocument;
  342. DoInsert: boolean;
  343. int: sw_integer;
  344. Text: string;
  345. error: word;
  346. begin
  347. D:=Documents^.SearchDocument(LinkURL);
  348. if D=nil then
  349. begin
  350. New(D, Init(NormalizeChildURL(LinkURL)));
  351. Documents^.Insert(D);
  352. end;
  353. If assigned(LastSynonym) then
  354. LastSynonym^.Synonym:=D;
  355. DoInsert:=true;
  356. If (length(LinkText)=0) or (Pos(',',LinkText)=1) then
  357. DoInsert:=false;
  358. Val(LinkText,int,error);
  359. If (Error>1) and (LinkText[Error]=' ') then
  360. Text:=Trim(Copy(LinkText,error+1,length(LinkText)))
  361. else
  362. Text:=LinkText;
  363. IF DoInsert then
  364. D^.AddAlias(Text);
  365. If InNameAnchor then
  366. LastSynonym:=D;
  367. end;
  368. constructor THTMLLinkScanner.LoadDocuments(var S: TStream);
  369. var P,L: longint;
  370. OK: boolean;
  371. PS: PString;
  372. begin
  373. OK:=false;
  374. P:=S.GetPos;
  375. S.Read(L,sizeof(L));
  376. if (S.Status=stOK) and (L=HTMLIndexMagicNo) then
  377. begin
  378. S.Read(L,sizeof(L));
  379. CurrentHTMLIndexVersion:=L;
  380. OK:=(S.Status=stOK);
  381. end;
  382. if not OK then
  383. begin
  384. S.Reset;
  385. S.Seek(P);
  386. end
  387. else
  388. BaseDir:=S.ReadStr;
  389. New(Documents, Load(S));
  390. if not Assigned(Documents) then
  391. Fail;
  392. Documents^.MoveAliasesToSynonym;
  393. CurrentHTMLIndexVersion:=HTMLIndexVersion;
  394. end;
  395. procedure THTMLLinkScanner.StoreDocuments(var S: TStream);
  396. var L: longint;
  397. begin
  398. L:=HTMLIndexMagicNo;
  399. S.Write(L,sizeof(L));
  400. L:=HTMLIndexVersion;
  401. CurrentHTMLIndexVersion:=L;
  402. S.Write(L,sizeof(L));
  403. S.WriteStr(BaseDir);
  404. Documents^.MoveAliasesToSynonym;
  405. Documents^.Store(S);
  406. end;
  407. destructor THTMLLinkScanner.Done;
  408. begin
  409. inherited Done;
  410. if Assigned(Documents) then Dispose(Documents, Done); Documents:=nil;
  411. if Assigned(BaseDir) then DisposeStr(BaseDir); BaseDir:=nil;
  412. end;
  413. constructor THTMLLinkScanFile.Init(const ADocumentURL: string);
  414. begin
  415. inherited Init;
  416. SetStr(DocumentURL,ADocumentURL);
  417. end;
  418. function THTMLLinkScanFile.GetDocumentURL: string;
  419. begin
  420. GetDocumentURL:=GetStr(DocumentURL);
  421. end;
  422. destructor THTMLLinkScanFile.Done;
  423. begin
  424. inherited Done;
  425. if Assigned(DocumentURL) then DisposeStr(DocumentURL); DocumentURL:=nil;
  426. end;
  427. function THTMLLinkScanFileCollection.At(Index: sw_Integer): PHTMLLinkScanFile;
  428. begin
  429. At:=inherited At(Index);
  430. end;
  431. function THTMLLinkScanFileCollection.Compare(Key1, Key2: Pointer): sw_Integer;
  432. var R: integer;
  433. K1: PHTMLLinkScanFile absolute Key1;
  434. K2: PHTMLLinkScanFile absolute Key2;
  435. S1,S2: string;
  436. begin
  437. S1:=UpcaseStr(K1^.GetDocumentURL); S2:=UpcaseStr(K2^.GetDocumentURL);
  438. if S1<S2 then R:=-1 else
  439. if S1>S2 then R:= 1 else
  440. R:=0;
  441. Compare:=R;
  442. end;
  443. function THTMLLinkScanFileCollection.SearchFile(const DocURL: string): PHTMLLinkScanFile;
  444. var P,D: PHTMLLinkScanFile;
  445. Index: sw_integer;
  446. begin
  447. New(D, Init(DocURL));
  448. if Search(D,Index)=false then P:=nil else
  449. P:=At(Index);
  450. Dispose(D, Done);
  451. SearchFile:=P;
  452. end;
  453. function THTMLLinkScanFileCollection.FindFileWithState(AState: THTMLLinkScanState): PHTMLLinkScanFile;
  454. var I: sw_integer;
  455. P,D: PHTMLLinkScanFile;
  456. begin
  457. P:=nil;
  458. for I:=0 to Count-1 do
  459. begin
  460. D:=At(I);
  461. if D^.State=AState then
  462. begin
  463. P:=D;
  464. Break;
  465. end;
  466. end;
  467. FindFileWithState:=P;
  468. end;
  469. constructor THTMLFileLinkScanner.Init(const ABaseDir: string);
  470. begin
  471. inherited Init(ABaseDir);
  472. New(DocumentFiles, Init(50,100));
  473. end;
  474. procedure THTMLFileLinkScanner.ProcessDocument(const DocumentURL: string; AOptions: THTMLLinkScanOptions);
  475. var P: PHTMLLinkScanFile;
  476. begin
  477. CurBaseURL:=''; Options:=AOptions;
  478. ScheduleDoc(DocumentURL);
  479. repeat
  480. P:=DocumentFiles^.FindFileWithState(ssScheduled);
  481. if Assigned(P) then
  482. ProcessDoc(P);
  483. until P=nil;
  484. end;
  485. function THTMLFileLinkScanner.GetDocumentBaseURL: string;
  486. begin
  487. GetDocumentBaseURL:=CurBaseURL;
  488. end;
  489. function THTMLFileLinkScanner.CheckURL(const URL: string): boolean;
  490. var OK: boolean;
  491. begin
  492. if soSubDocsOnly in Options then
  493. OK:=UpcaseStr(copy(URL,1,length(BaseURL)))=UpcaseStr(BaseURL)
  494. else
  495. OK:=true;
  496. CheckURL:=OK;
  497. end;
  498. procedure THTMLFileLinkScanner.AddLink(const LinkText, LinkURL: string);
  499. var D: PHTMLLinkScanFile;
  500. P: sw_integer;
  501. DocURL: string;
  502. begin
  503. P:=Pos('#',LinkURL);
  504. if P=0 then DocURL:=LinkURL else DocURL:=copy(LinkURL,1,P-1);
  505. D:=DocumentFiles^.SearchFile(DocURL);
  506. if Assigned(D)=false then
  507. ScheduleDoc(DocURL);
  508. inherited AddLink(LinkText,LinkURL);
  509. end;
  510. procedure THTMLFileLinkScanner.ProcessDoc(Doc: PHTMLLinkScanFile);
  511. var F: PDOSTextFile;
  512. begin
  513. if Assigned(Doc)=false then Exit;
  514. Doc^.State:=ssProcessing;
  515. CurDoc:=Doc^.GetDocumentURL;
  516. New(F, Init(Doc^.GetDocumentURL));
  517. if Assigned(F) then
  518. begin
  519. CurBaseURL:=CompleteURL(Doc^.GetDocumentURL,'');
  520. Process(F);
  521. Dispose(F, Done);
  522. end;
  523. Doc^.State:=ssScanned;
  524. CurDoc:='';
  525. end;
  526. procedure THTMLFileLinkScanner.ScheduleDoc(const DocumentURL: string);
  527. var D: PHTMLLinkScanFile;
  528. begin
  529. New(D, Init(DocumentURL));
  530. D^.State:=ssScheduled;
  531. DocumentFiles^.Insert(D);
  532. end;
  533. destructor THTMLFileLinkScanner.Done;
  534. begin
  535. inherited Done;
  536. if Assigned(DocumentFiles) then Dispose(DocumentFiles, Done); DocumentFiles:=nil;
  537. end;
  538. procedure RegisterWHTMLScan;
  539. begin
  540. RegisterType(RHTMLLinkScanDocument);
  541. end;
  542. END.