sax_html.pp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647
  1. {
  2. This file is part of the Free Component Library
  3. HTML parser with SAX-like interface
  4. Copyright (c) 2000-2002 by
  5. Areca Systems GmbH / Sebastian Guenther, [email protected]
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {
  13. Known problems:
  14. * The whitespace handling does only work for processing the DOM tree.
  15. Storing the DOM tree to a XML file will result in a quite ugly file.
  16. (This probably has got much better with recent versions, which do
  17. decent whitespace converting, but it's not tested really good.)
  18. * Entity references in attribute values don't get parsed.
  19. }
  20. unit SAX_HTML;
  21. interface
  22. uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs;
  23. type
  24. { THTMLReader: The HTML reader class }
  25. THTMLScannerContext = (
  26. scUnknown,
  27. scWhitespace, // within whitespace
  28. scText, // within text
  29. scEntityReference, // within entity reference ("&...;")
  30. scTag); // within a start tag or end tag
  31. THTMLReader = class(TSAXReader)
  32. private
  33. FStarted: Boolean;
  34. FEndOfStream: Boolean;
  35. FScannerContext: THTMLScannerContext;
  36. FTokenText: SAXString;
  37. FCurStringValueDelimiter: Char;
  38. FAttrNameRead: Boolean;
  39. protected
  40. procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
  41. public
  42. constructor Create;
  43. destructor Destroy; override;
  44. procedure Parse(AInput: TSAXInputSource); override; overload;
  45. property EndOfStream: Boolean read FEndOfStream;
  46. property ScannerContext: THTMLScannerContext read FScannerContext;
  47. property TokenText: SAXString read FTokenText;
  48. end;
  49. { THTMLToDOMConverter }
  50. THTMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
  51. THTMLNodeInfo = class
  52. NodeType: THTMLNodeType;
  53. DOMNode: TDOMNode;
  54. end;
  55. THTMLToDOMConverter = class
  56. private
  57. FReader: THTMLReader;
  58. FDocument: TDOMDocument;
  59. FElementStack: TList;
  60. FNodeBuffer: TList;
  61. IsFragmentMode, FragmentRootSet: Boolean;
  62. FragmentRoot: TDOMNode;
  63. procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
  64. Start, Count: Integer);
  65. procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
  66. Start, Count: Integer);
  67. procedure ReaderSkippedEntity(Sender: TObject; const Name: SAXString);
  68. procedure ReaderStartElement(Sender: TObject;
  69. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  70. procedure ReaderEndElement(Sender: TObject;
  71. const NamespaceURI, LocalName, RawName: SAXString);
  72. public
  73. constructor Create(AReader: THTMLReader; ADocument: TDOMDocument);
  74. constructor CreateFragment(AReader: THTMLReader; AFragmentRoot: TDOMNode);
  75. destructor Destroy; override;
  76. end;
  77. // Helper functions; these ones are HTML equivalents of ReadXML[File|Fragment]
  78. procedure ReadHTMLFile(var ADoc: THTMLDocument; const AFilename: String);
  79. procedure ReadHTMLFile(var ADoc: THTMLDocument; var f: TStream);
  80. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  81. procedure ReadHTMLFragment(AParentNode: TDOMNode; var f: TStream);
  82. implementation
  83. const
  84. WhitespaceChars = [#9, #10, #13, ' '];
  85. constructor THTMLReader.Create;
  86. begin
  87. inherited Create;
  88. FScannerContext := scUnknown;
  89. end;
  90. destructor THTMLReader.Destroy;
  91. begin
  92. if FStarted then
  93. DoEndDocument;
  94. inherited Destroy;
  95. end;
  96. procedure THTMLReader.Parse(AInput: TSAXInputSource);
  97. const
  98. MaxBufferSize = 1024;
  99. var
  100. Buffer: array[0..MaxBufferSize - 1] of Char;
  101. BufferSize, BufferPos: Integer;
  102. begin
  103. if not FStarted then
  104. begin
  105. FStarted := True;
  106. DoStartDocument;
  107. end;
  108. FEndOfStream := False;
  109. while True do
  110. begin
  111. // Read data into the input buffer
  112. BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
  113. if BufferSize = 0 then
  114. begin
  115. FEndOfStream := True;
  116. break;
  117. end;
  118. BufferPos := 0;
  119. while BufferPos < BufferSize do
  120. case ScannerContext of
  121. scUnknown:
  122. case Buffer[BufferPos] of
  123. #9, #10, #13, ' ':
  124. EnterNewScannerContext(scWhitespace);
  125. '&':
  126. begin
  127. Inc(BufferPos);
  128. EnterNewScannerContext(scEntityReference);
  129. end;
  130. '<':
  131. begin
  132. Inc(BufferPos);
  133. EnterNewScannerContext(scTag);
  134. end;
  135. else
  136. EnterNewScannerContext(scText);
  137. end;
  138. scWhitespace:
  139. case Buffer[BufferPos] of
  140. #9, #10, #13, ' ':
  141. begin
  142. FTokenText := FTokenText + Buffer[BufferPos];
  143. Inc(BufferPos);
  144. end;
  145. '&':
  146. begin
  147. Inc(BufferPos);
  148. EnterNewScannerContext(scEntityReference);
  149. end;
  150. '<':
  151. begin
  152. Inc(BufferPos);
  153. EnterNewScannerContext(scTag);
  154. end;
  155. else
  156. EnterNewScannerContext(scText);
  157. end;
  158. scText:
  159. case Buffer[BufferPos] of
  160. #9, #10, #13, ' ':
  161. EnterNewScannerContext(scWhitespace);
  162. '&':
  163. begin
  164. Inc(BufferPos);
  165. EnterNewScannerContext(scEntityReference);
  166. end;
  167. '<':
  168. begin
  169. Inc(BufferPos);
  170. EnterNewScannerContext(scTag);
  171. end;
  172. else
  173. begin
  174. FTokenText := FTokenText + Buffer[BufferPos];
  175. Inc(BufferPos);
  176. end;
  177. end;
  178. scEntityReference:
  179. if Buffer[BufferPos] = ';' then
  180. begin
  181. Inc(BufferPos);
  182. EnterNewScannerContext(scUnknown);
  183. end else if not (Buffer[BufferPos] in
  184. ['a'..'z', 'A'..'Z', '0'..'9', '#']) then
  185. EnterNewScannerContext(scUnknown)
  186. else
  187. begin
  188. FTokenText := FTokenText + Buffer[BufferPos];
  189. Inc(BufferPos);
  190. end;
  191. scTag:
  192. case Buffer[BufferPos] of
  193. '''', '"':
  194. begin
  195. if FAttrNameRead then
  196. begin
  197. if FCurStringValueDelimiter = #0 then
  198. FCurStringValueDelimiter := Buffer[BufferPos]
  199. else if FCurStringValueDelimiter = Buffer[BufferPos] then
  200. begin
  201. FCurStringValueDelimiter := #0;
  202. FAttrNameRead := False;
  203. end;
  204. end;
  205. FTokenText := FTokenText + Buffer[BufferPos];
  206. Inc(BufferPos);
  207. end;
  208. '=':
  209. begin
  210. FAttrNameRead := True;
  211. FTokenText := FTokenText + Buffer[BufferPos];
  212. Inc(BufferPos);
  213. end;
  214. '>':
  215. begin
  216. Inc(BufferPos);
  217. if FCurStringValueDelimiter = #0 then
  218. EnterNewScannerContext(scUnknown);
  219. end;
  220. else
  221. begin
  222. FTokenText := FTokenText + Buffer[BufferPos];
  223. Inc(BufferPos);
  224. end;
  225. end;
  226. end;
  227. end;
  228. end;
  229. procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
  230. function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
  231. var
  232. i, j: Integer;
  233. AttrName: String;
  234. ValueDelimiter: Char;
  235. DoIncJ: Boolean;
  236. begin
  237. Attr := nil;
  238. i := Pos(' ', s);
  239. if i <= 0 then
  240. Result := LowerCase(s)
  241. else
  242. begin
  243. Result := LowerCase(Copy(s, 1, i - 1));
  244. Attr := TSAXAttributes.Create;
  245. Inc(i);
  246. while (i <= Length(s)) and (s[i] in WhitespaceChars) do
  247. Inc(i);
  248. SetLength(AttrName, 0);
  249. j := i;
  250. while j <= Length(s) do
  251. if s[j] = '=' then
  252. begin
  253. AttrName := LowerCase(Copy(s, i, j - i));
  254. Inc(j);
  255. if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
  256. begin
  257. ValueDelimiter := s[j];
  258. Inc(j);
  259. end else
  260. ValueDelimiter := #0;
  261. i := j;
  262. DoIncJ := False;
  263. while j <= Length(s) do
  264. if ValueDelimiter = #0 then
  265. if s[j] in WhitespaceChars then
  266. break
  267. else
  268. Inc(j)
  269. else if s[j] = ValueDelimiter then
  270. begin
  271. DoIncJ := True;
  272. break
  273. end else
  274. Inc(j);
  275. Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
  276. if DoIncJ then
  277. Inc(j);
  278. while (j <= Length(s)) and (s[j] in WhitespaceChars) do
  279. Inc(j);
  280. i := j;
  281. end
  282. else if s[j] in WhitespaceChars then
  283. begin
  284. Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
  285. Inc(j);
  286. while (j <= Length(s)) and (s[j] in WhitespaceChars) do
  287. Inc(j);
  288. i := j;
  289. end else
  290. Inc(j);
  291. end;
  292. end;
  293. var
  294. Attr: TSAXAttributes;
  295. EntString, TagName: String;
  296. Found: Boolean;
  297. Ent: Char;
  298. i: Integer;
  299. begin
  300. case ScannerContext of
  301. scWhitespace:
  302. DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText));
  303. scText:
  304. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  305. scEntityReference:
  306. begin
  307. if ResolveHTMLEntityReference(TokenText, Ent) then
  308. begin
  309. EntString := Ent;
  310. DoCharacters(PSAXChar(EntString), 0, 1);
  311. end else
  312. begin
  313. { Is this a predefined Unicode character entity? We must check this,
  314. as undefined entities must be handled as text, for compatiblity
  315. to popular browsers... }
  316. Found := False;
  317. for i := Low(UnicodeHTMLEntities) to High(UnicodeHTMLEntities) do
  318. if UnicodeHTMLEntities[i] = TokenText then
  319. begin
  320. Found := True;
  321. break;
  322. end;
  323. if Found then
  324. DoSkippedEntity(TokenText)
  325. else
  326. DoCharacters(PSAXChar('&' + TokenText), 0, Length(TokenText) + 1);
  327. end;
  328. end;
  329. scTag:
  330. if Length(TokenText) > 0 then
  331. begin
  332. Attr := nil;
  333. if TokenText[1] = '/' then
  334. begin
  335. DoEndElement('',
  336. SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
  337. end else if TokenText[1] <> '!' then
  338. begin
  339. // Do NOT combine to a single line, as Attr is an output value!
  340. TagName := SplitTagString(TokenText, Attr);
  341. DoStartElement('', TagName, '', Attr);
  342. end;
  343. if Assigned(Attr) then
  344. Attr.Free;
  345. end;
  346. end;
  347. FScannerContext := NewContext;
  348. SetLength(FTokenText, 0);
  349. FCurStringValueDelimiter := #0;
  350. FAttrNameRead := False;
  351. end;
  352. { THTMLToDOMConverter }
  353. constructor THTMLToDOMConverter.Create(AReader: THTMLReader;
  354. ADocument: TDOMDocument);
  355. begin
  356. inherited Create;
  357. FReader := AReader;
  358. FReader.OnCharacters := @ReaderCharacters;
  359. FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
  360. FReader.OnSkippedEntity := @ReaderSkippedEntity;
  361. FReader.OnStartElement := @ReaderStartElement;
  362. FReader.OnEndElement := @ReaderEndElement;
  363. FDocument := ADocument;
  364. FElementStack := TList.Create;
  365. FNodeBuffer := TList.Create;
  366. end;
  367. constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
  368. AFragmentRoot: TDOMNode);
  369. begin
  370. inherited Create;
  371. FReader := AReader;
  372. FReader.OnCharacters := @ReaderCharacters;
  373. FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
  374. FReader.OnSkippedEntity := @ReaderSkippedEntity;
  375. FReader.OnStartElement := @ReaderStartElement;
  376. FReader.OnEndElement := @ReaderEndElement;
  377. FDocument := AFragmentRoot.OwnerDocument;
  378. FElementStack := TList.Create;
  379. FNodeBuffer := TList.Create;
  380. FragmentRoot := AFragmentRoot;
  381. IsFragmentMode := True;
  382. end;
  383. destructor THTMLToDOMConverter.Destroy;
  384. var
  385. i: Integer;
  386. begin
  387. // Theoretically, always exactly one item will remain - the root element:
  388. for i := 0 to FNodeBuffer.Count - 1 do
  389. THTMLNodeInfo(FNodeBuffer[i]).Free;
  390. FNodeBuffer.Free;
  391. FElementStack.Free;
  392. inherited Destroy;
  393. end;
  394. procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
  395. const ch: PSAXChar; Start, Count: Integer);
  396. var
  397. s: SAXString;
  398. NodeInfo: THTMLNodeInfo;
  399. begin
  400. SetLength(s, Count);
  401. Move(ch^, s[1], Count * SizeOf(SAXChar));
  402. NodeInfo := THTMLNodeInfo.Create;
  403. NodeInfo.NodeType := ntText;
  404. NodeInfo.DOMNode := FDocument.CreateTextNode(s);
  405. FNodeBuffer.Add(NodeInfo);
  406. end;
  407. procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
  408. const ch: PSAXChar; Start, Count: Integer);
  409. var
  410. s: SAXString;
  411. NodeInfo: THTMLNodeInfo;
  412. begin
  413. SetLength(s, Count);
  414. Move(ch^, s[1], Count * SizeOf(SAXChar));
  415. NodeInfo := THTMLNodeInfo.Create;
  416. NodeInfo.NodeType := ntWhitespace;
  417. NodeInfo.DOMNode := FDocument.CreateTextNode(s);
  418. FNodeBuffer.Add(NodeInfo);
  419. end;
  420. procedure THTMLToDOMConverter.ReaderSkippedEntity(Sender: TObject;
  421. const Name: SAXString);
  422. var
  423. NodeInfo: THTMLNodeInfo;
  424. begin
  425. NodeInfo := THTMLNodeInfo.Create;
  426. NodeInfo.NodeType := ntEntityReference;
  427. NodeInfo.DOMNode := FDocument.CreateEntityReference(Name);
  428. FNodeBuffer.Add(NodeInfo);
  429. end;
  430. procedure THTMLToDOMConverter.ReaderStartElement(Sender: TObject;
  431. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  432. var
  433. NodeInfo: THTMLNodeInfo;
  434. Element: TDOMElement;
  435. i: Integer;
  436. begin
  437. // WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
  438. Element := FDocument.CreateElement(LocalName);
  439. if Assigned(Attr) then
  440. begin
  441. // WriteLn('Attribute: ', Attr.GetLength);
  442. for i := 0 to Attr.GetLength - 1 do
  443. begin
  444. // WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
  445. Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
  446. end;
  447. end;
  448. NodeInfo := THTMLNodeInfo.Create;
  449. NodeInfo.NodeType := ntTag;
  450. NodeInfo.DOMNode := Element;
  451. if IsFragmentMode then
  452. begin
  453. if not FragmentRootSet then
  454. begin
  455. FragmentRoot.AppendChild(Element);
  456. FragmentRootSet := True;
  457. end;
  458. end else
  459. if not Assigned(FDocument.DocumentElement) then
  460. FDocument.AppendChild(Element);
  461. FNodeBuffer.Add(NodeInfo);
  462. // WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
  463. end;
  464. procedure THTMLToDOMConverter.ReaderEndElement(Sender: TObject;
  465. const NamespaceURI, LocalName, RawName: SAXString);
  466. var
  467. NodeInfo, NodeInfo2: THTMLNodeInfo;
  468. i : Integer;
  469. j : THTMLElementTag;
  470. TagInfo: PHTMLElementProps;
  471. begin
  472. // WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
  473. // Find the matching start tag
  474. i := FNodeBuffer.Count - 1;
  475. while i >= 0 do
  476. begin
  477. NodeInfo := THTMLNodeInfo(FNodeBuffer.Items[i]);
  478. if (NodeInfo.NodeType = ntTag) and
  479. (CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
  480. begin
  481. // We found the matching start tag
  482. TagInfo := nil;
  483. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  484. if CompareText(HTMLElementProps[j].Name, LocalName) = 0 then
  485. begin
  486. TagInfo := @HTMLElementProps[j];
  487. break;
  488. end;
  489. Inc(i);
  490. while i < FNodeBuffer.Count do
  491. begin
  492. NodeInfo2 := THTMLNodeInfo(FNodeBuffer.Items[i]);
  493. if (NodeInfo2.NodeType = ntWhitespace) and Assigned(TagInfo) and
  494. (not (efPreserveWhitespace in TagInfo^.Flags)) then
  495. // Handle whitespace, which doesn't need to get preserved...
  496. if not (efPCDATAContent in TagInfo^.Flags) then
  497. // No character data allowed within the current element
  498. NodeInfo2.DOMNode.Free
  499. else
  500. begin
  501. // Character data allowed, so normalize it
  502. NodeInfo2.DOMNode.NodeValue := ' ';
  503. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode)
  504. end
  505. else
  506. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
  507. NodeInfo2.Free;
  508. FNodeBuffer.Delete(i);
  509. end;
  510. break;
  511. end;
  512. Dec(i);
  513. end;
  514. end;
  515. procedure ReadHTMLFile(var ADoc: THTMLDocument; const AFilename: String);
  516. var
  517. f: TStream;
  518. begin
  519. ADoc := nil;
  520. f := TFileStream.Create(AFilename, fmOpenRead);
  521. try
  522. ReadHTMLFile(ADoc, f);
  523. finally
  524. f.Free;
  525. end;
  526. end;
  527. procedure ReadHTMLFile(var ADoc: THTMLDocument; var f: TStream);
  528. var
  529. Reader: THTMLReader;
  530. Converter: THTMLToDOMConverter;
  531. begin
  532. ADoc := THTMLDocument.Create;
  533. Reader := THTMLReader.Create;
  534. try
  535. Converter := THTMLToDOMConverter.Create(Reader, ADoc);
  536. try
  537. Reader.ParseStream(f);
  538. finally
  539. Converter.Free;
  540. end;
  541. finally
  542. Reader.Free;
  543. end;
  544. end;
  545. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  546. var
  547. f: TStream;
  548. begin
  549. f := TFileStream.Create(AFilename, fmOpenRead);
  550. try
  551. ReadHTMLFragment(AParentNode, f);
  552. finally
  553. f.Free;
  554. end;
  555. end;
  556. procedure ReadHTMLFragment(AParentNode: TDOMNode; var f: TStream);
  557. var
  558. Reader: THTMLReader;
  559. Converter: THTMLToDOMConverter;
  560. begin
  561. Reader := THTMLReader.Create;
  562. try
  563. Converter := THTMLToDOMConverter.CreateFragment(Reader, AParentNode);
  564. try
  565. Reader.ParseStream(f);
  566. finally
  567. Converter.Free;
  568. end;
  569. finally
  570. Reader.Free;
  571. end;
  572. end;
  573. end.