sax_html.pp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762
  1. {
  2. This file is part of the Free Component Library
  3. HTML parser with SAX-like interface
  4. Copyright (c) 2000-2002 by
  5. Areca Systems GmbH / Sebastian Guenther, [email protected]
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {
  13. Known problems:
  14. * The whitespace handling does only work for processing the DOM tree.
  15. Storing the DOM tree to a XML file will result in a quite ugly file.
  16. (This probably has got much better with recent versions, which do
  17. decent whitespace converting, but it's not tested really good.)
  18. * Entity references in attribute values don't get parsed.
  19. }
  20. {$mode objfpc}
  21. {$H+}
  22. unit SAX_HTML;
  23. interface
  24. uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs,xmlutils;
  25. type
  26. { THTMLReader: The HTML reader class }
  27. THTMLScannerContext = (
  28. scUnknown,
  29. scWhitespace, // within whitespace
  30. scText, // within text
  31. scEntityReference, // within entity reference ("&...;")
  32. scTag); // within a start tag or end tag
  33. THTMLReader = class(TSAXReader)
  34. private
  35. FStarted: Boolean;
  36. FEndOfStream: Boolean;
  37. FScannerContext: THTMLScannerContext;
  38. FTokenText: SAXString;
  39. FRawTokenText: string;
  40. FCurStringValueDelimiter: Char;
  41. FAttrNameRead: Boolean;
  42. FStack: array of THTMLElementTag;
  43. FNesting: Integer;
  44. procedure AutoClose(const aName: SAXString);
  45. procedure NamePush(const aName: SAXString);
  46. procedure NamePop;
  47. protected
  48. procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
  49. public
  50. constructor Create;
  51. destructor Destroy; override;
  52. procedure Parse(AInput: TSAXInputSource); override; overload;
  53. property EndOfStream: Boolean read FEndOfStream;
  54. property ScannerContext: THTMLScannerContext read FScannerContext;
  55. property TokenText: SAXString read FTokenText;
  56. end;
  57. { THTMLToDOMConverter }
  58. THTMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
  59. THTMLNodeInfo = class
  60. NodeType: THTMLNodeType;
  61. DOMNode: TDOMNode;
  62. end;
  63. THTMLToDOMConverter = class
  64. private
  65. FReader: THTMLReader;
  66. FDocument: TDOMDocument;
  67. FElementStack: TList;
  68. FNodeBuffer: TList;
  69. IsFragmentMode, FragmentRootSet: Boolean;
  70. FragmentRoot: TDOMNode;
  71. procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
  72. Start, Count: Integer);
  73. procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
  74. Start, Count: Integer);
  75. procedure ReaderStartElement(Sender: TObject;
  76. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  77. procedure ReaderEndElement(Sender: TObject;
  78. const NamespaceURI, LocalName, RawName: SAXString);
  79. public
  80. constructor Create(AReader: THTMLReader; ADocument: TDOMDocument);
  81. constructor CreateFragment(AReader: THTMLReader; AFragmentRoot: TDOMNode);
  82. destructor Destroy; override;
  83. end;
  84. // Helper functions; these ones are HTML equivalents of ReadXML[File|Fragment]
  85. procedure ReadHTMLFile(out ADoc: THTMLDocument; const AFilename: String);
  86. procedure ReadHTMLFile(out ADoc: THTMLDocument; f: TStream);
  87. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  88. procedure ReadHTMLFragment(AParentNode: TDOMNode; f: TStream);
  89. implementation
  90. const
  91. WhitespaceChars = [#9, #10, #13, ' '];
  92. constructor THTMLReader.Create;
  93. begin
  94. inherited Create;
  95. FScannerContext := scUnknown;
  96. SetLength(FStack, 16);
  97. end;
  98. destructor THTMLReader.Destroy;
  99. begin
  100. if FStarted then
  101. DoEndDocument;
  102. inherited Destroy;
  103. end;
  104. function CheckForName(const Tag: SAXString): Boolean;
  105. var
  106. p, p1: PSAXChar;
  107. begin
  108. p := PSAXChar(Tag);
  109. result := False;
  110. if p^ <> '!' then
  111. begin
  112. if p^ = '/' then Inc(p);
  113. p1 := p;
  114. while (p1^ <> #0) and (p1^ <> '/') and not IsXMLWhitespace(p1^) do
  115. Inc(p1);
  116. result := IsXMLName(p, p1-p);
  117. end;
  118. end;
  119. procedure THTMLReader.Parse(AInput: TSAXInputSource);
  120. const
  121. MaxBufferSize = 1024;
  122. var
  123. Buffer: array[0..MaxBufferSize - 1] of Char;
  124. BufferSize, BufferPos: Integer;
  125. begin
  126. if not FStarted then
  127. begin
  128. FStarted := True;
  129. DoStartDocument;
  130. end;
  131. FEndOfStream := False;
  132. FStopFlag := False;
  133. while not FStopFlag do
  134. begin
  135. // Read data into the input buffer
  136. BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
  137. if BufferSize = 0 then
  138. begin
  139. FEndOfStream := True;
  140. break;
  141. end;
  142. BufferPos := 0;
  143. while (BufferPos < BufferSize) and not FStopFlag do
  144. begin
  145. case ScannerContext of
  146. scUnknown:
  147. case Buffer[BufferPos] of
  148. #9, #10, #13, ' ':
  149. EnterNewScannerContext(scWhitespace);
  150. '&':
  151. begin
  152. Inc(BufferPos);
  153. EnterNewScannerContext(scEntityReference);
  154. end;
  155. '<':
  156. begin
  157. Inc(BufferPos);
  158. EnterNewScannerContext(scTag);
  159. end;
  160. else
  161. EnterNewScannerContext(scText);
  162. end;
  163. scWhitespace:
  164. case Buffer[BufferPos] of
  165. #9, #10, #13, ' ':
  166. begin
  167. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  168. Inc(BufferPos);
  169. end;
  170. '&':
  171. begin
  172. Inc(BufferPos);
  173. EnterNewScannerContext(scEntityReference);
  174. end;
  175. '<':
  176. begin
  177. Inc(BufferPos);
  178. EnterNewScannerContext(scTag);
  179. end;
  180. else
  181. FScannerContext := scText;
  182. end;
  183. scText:
  184. case Buffer[BufferPos] of
  185. '&':
  186. begin
  187. Inc(BufferPos);
  188. EnterNewScannerContext(scEntityReference);
  189. end;
  190. '<':
  191. begin
  192. Inc(BufferPos);
  193. EnterNewScannerContext(scTag);
  194. end;
  195. else
  196. begin
  197. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  198. Inc(BufferPos);
  199. end;
  200. end;
  201. scEntityReference:
  202. if Buffer[BufferPos] = ';' then
  203. begin
  204. Inc(BufferPos);
  205. EnterNewScannerContext(scUnknown);
  206. end else if not (Buffer[BufferPos] in
  207. ['a'..'z', 'A'..'Z', '0'..'9', '#']) then
  208. EnterNewScannerContext(scUnknown)
  209. else
  210. begin
  211. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  212. Inc(BufferPos);
  213. end;
  214. scTag:
  215. case Buffer[BufferPos] of
  216. '''', '"':
  217. begin
  218. if FAttrNameRead then
  219. begin
  220. if FCurStringValueDelimiter = #0 then
  221. FCurStringValueDelimiter := Buffer[BufferPos]
  222. else if FCurStringValueDelimiter = Buffer[BufferPos] then
  223. begin
  224. FCurStringValueDelimiter := #0;
  225. FAttrNameRead := False;
  226. end;
  227. end;
  228. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  229. Inc(BufferPos);
  230. end;
  231. '=':
  232. begin
  233. FAttrNameRead := True;
  234. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  235. Inc(BufferPos);
  236. end;
  237. '>':
  238. begin
  239. Inc(BufferPos);
  240. if FCurStringValueDelimiter = #0 then
  241. EnterNewScannerContext(scUnknown);
  242. end;
  243. '<': // either an unclosed tag or unescaped '<' in text; attempt recovery
  244. begin
  245. // TODO: this check is hardly complete, probably must also check if
  246. // tag name is followed by legal attributes.
  247. if CheckForName(FRawTokenText) then { <-- ansi to wide conversion here }
  248. EnterNewScannerContext(scUnknown) // assume unclosed tag
  249. else if (FRawTokenText <> '') and (FRawTokenText[1] <> '!') then
  250. begin
  251. Insert('<', FRawTokenText, 1); // assume plaintext
  252. FScannerContext := scText;
  253. EnterNewScannerContext(scUnknown);
  254. end
  255. else
  256. begin // in comment, ignore
  257. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  258. Inc(BufferPos);
  259. end;
  260. end;
  261. else
  262. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  263. Inc(BufferPos);
  264. end;
  265. end; // case ScannerContext of
  266. end; // while not endOfBuffer
  267. end;
  268. end;
  269. function LookupTag(const aName: SAXString): THTMLElementTag;
  270. var
  271. j: THTMLElementTag;
  272. ansiName: string;
  273. begin
  274. ansiName := aName;
  275. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  276. if SameText(HTMLElementProps[j].Name, ansiName) then
  277. begin
  278. Result := j;
  279. Exit;
  280. end;
  281. Result := etUnknown;
  282. end;
  283. procedure THTMLReader.AutoClose(const aName: SAXString);
  284. var
  285. newTag: THTMLElementTag;
  286. begin
  287. newTag := LookupTag(aName);
  288. while (FNesting > 0) and IsAutoClose(newTag, FStack[FNesting-1]) do
  289. begin
  290. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  291. namePop;
  292. end;
  293. end;
  294. procedure THTMLReader.NamePush(const aName: SAXString);
  295. var
  296. tag: THTMLElementTag;
  297. begin
  298. tag := LookupTag(aName);
  299. if FNesting >= Length(FStack) then
  300. SetLength(FStack, FNesting * 2);
  301. FStack[FNesting] := tag;
  302. Inc(FNesting);
  303. end;
  304. procedure THTMLReader.NamePop;
  305. begin
  306. if FNesting <= 0 then
  307. Exit;
  308. Dec(FNesting);
  309. FStack[FNesting] := etUnknown;
  310. end;
  311. function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
  312. var
  313. i, j: Integer;
  314. AttrName: SAXString;
  315. ValueDelimiter: WideChar;
  316. DoIncJ: Boolean;
  317. begin
  318. Attr := nil;
  319. i := 0;
  320. repeat
  321. Inc(i)
  322. until (i > Length(s)) or IsXMLWhitespace(s[i]);
  323. if i > Length(s) then
  324. Result := s
  325. else
  326. begin
  327. Result := Copy(s, 1, i - 1);
  328. Attr := TSAXAttributes.Create;
  329. Inc(i);
  330. while (i <= Length(s)) and IsXMLWhitespace(s[i]) do
  331. Inc(i);
  332. SetLength(AttrName, 0);
  333. j := i;
  334. while j <= Length(s) do
  335. if s[j] = '=' then
  336. begin
  337. AttrName := Copy(s, i, j - i);
  338. WStrLower(AttrName);
  339. Inc(j);
  340. if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
  341. begin
  342. ValueDelimiter := s[j];
  343. Inc(j);
  344. end else
  345. ValueDelimiter := #0;
  346. i := j;
  347. DoIncJ := False;
  348. while j <= Length(s) do
  349. if ValueDelimiter = #0 then
  350. if IsXMLWhitespace(s[j]) then
  351. break
  352. else
  353. Inc(j)
  354. else if s[j] = ValueDelimiter then
  355. begin
  356. DoIncJ := True;
  357. break
  358. end else
  359. Inc(j);
  360. if IsXMLName(AttrName) then
  361. Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
  362. if DoIncJ then
  363. Inc(j);
  364. while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
  365. Inc(j);
  366. i := j;
  367. end
  368. else if IsXMLWhitespace(s[j]) then
  369. begin
  370. if IsXMLName(@s[i], j-i) then
  371. Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
  372. Inc(j);
  373. while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
  374. Inc(j);
  375. i := j;
  376. end else
  377. Inc(j);
  378. end;
  379. WStrLower(result);
  380. end;
  381. function RightTrimmedLength(const s: SAXString): Integer;
  382. begin
  383. result := Length(s);
  384. while IsXmlWhitespace(s[result]) do Dec(result);
  385. end;
  386. function TagPos(elTag: THTMLElementTag; s: SAXString): Integer;
  387. begin
  388. WStrLower(s);
  389. Result := Pos(HTMLElementProps[elTag].Name, s);
  390. end;
  391. procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
  392. var
  393. Attr: TSAXAttributes;
  394. TagName: SAXString;
  395. Ent: SAXChar;
  396. i: Integer;
  397. elTag: THTMLElementTag;
  398. begin
  399. FTokenText := FRawTokenText;
  400. case ScannerContext of
  401. scWhitespace:
  402. if (FNesting > 0) and (efPCDataContent in HTMLElementProps[FStack[FNesting-1]].Flags) then
  403. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText))
  404. else
  405. DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
  406. scText:
  407. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  408. scEntityReference:
  409. begin
  410. if ResolveHTMLEntityReference(TokenText, Ent) then
  411. DoCharacters(@Ent, 0, 1)
  412. else
  413. DoCharacters(PSAXChar('&' + TokenText + ';'), 0, Length(TokenText) + 2);
  414. end;
  415. scTag:
  416. if Length(TokenText) > 0 then
  417. begin
  418. { ignore possibly unescaped markup in SCRIPT and STYLE }
  419. if (FNesting > 0) and (FStack[FNesting-1] in [etScript,etStyle]) and
  420. not (
  421. (TokenText[1] = '/') and
  422. (RightTrimmedLength(TokenText)=Length(HTMLElementProps[FStack[FNesting-1]].Name)+1) and
  423. (TagPos(FStack[FNesting-1], TokenText) = 2)
  424. )
  425. and (TokenText[1] <> '!') then
  426. begin
  427. FTokenText := '<'+FTokenText+'>';
  428. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  429. end
  430. else
  431. begin
  432. Attr := nil;
  433. if TokenText[Length(fTokenText)]='/' then // handle xml/xhtml style empty tag
  434. begin
  435. setlength(fTokenText,length(fTokenText)-1);
  436. // Do NOT combine to a single line, as Attr is an output value!
  437. TagName := SplitTagString(TokenText, Attr);
  438. AutoClose(TagName);
  439. DoStartElement('', TagName, '', Attr);
  440. DoEndElement('', TagName, '');
  441. end
  442. else if TokenText[1] = '/' then
  443. begin
  444. Delete(FTokenText, 1, 1);
  445. TagName := SplitTagString(TokenText, Attr);
  446. elTag := LookupTag(TagName);
  447. i := FNesting-1;
  448. while (i >= 0) and (FStack[i] <> elTag) and
  449. (efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
  450. Dec(i);
  451. if (i>=0) and (FStack[i] = elTag) then
  452. while FStack[FNesting-1] <> elTag do
  453. begin
  454. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  455. namePop;
  456. end;
  457. DoEndElement('', TagName, '');
  458. namePop;
  459. end
  460. else if TokenText[1] <> '!' then
  461. begin
  462. // Do NOT combine to a single line, as Attr is an output value!
  463. TagName := SplitTagString(TokenText, Attr);
  464. AutoClose(TagName);
  465. namePush(TagName);
  466. DoStartElement('', TagName, '', Attr);
  467. end;
  468. if Assigned(Attr) then
  469. Attr.Free;
  470. end;
  471. end;
  472. end;
  473. FScannerContext := NewContext;
  474. FTokenText := '';
  475. FRawTokenText := '';
  476. FCurStringValueDelimiter := #0;
  477. FAttrNameRead := False;
  478. end;
  479. { THTMLToDOMConverter }
  480. constructor THTMLToDOMConverter.Create(AReader: THTMLReader;
  481. ADocument: TDOMDocument);
  482. begin
  483. inherited Create;
  484. FReader := AReader;
  485. FReader.OnCharacters := @ReaderCharacters;
  486. FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
  487. FReader.OnStartElement := @ReaderStartElement;
  488. FReader.OnEndElement := @ReaderEndElement;
  489. FDocument := ADocument;
  490. FElementStack := TList.Create;
  491. FNodeBuffer := TList.Create;
  492. end;
  493. constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
  494. AFragmentRoot: TDOMNode);
  495. begin
  496. Create(AReader, AFragmentRoot.OwnerDocument);
  497. FragmentRoot := AFragmentRoot;
  498. IsFragmentMode := True;
  499. end;
  500. destructor THTMLToDOMConverter.Destroy;
  501. var
  502. i: Integer;
  503. begin
  504. // Theoretically, always exactly one item will remain - the root element:
  505. for i := 0 to FNodeBuffer.Count - 1 do
  506. THTMLNodeInfo(FNodeBuffer[i]).Free;
  507. FNodeBuffer.Free;
  508. FElementStack.Free;
  509. inherited Destroy;
  510. end;
  511. procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
  512. const ch: PSAXChar; Start, Count: Integer);
  513. var
  514. NodeInfo: THTMLNodeInfo;
  515. begin
  516. NodeInfo := THTMLNodeInfo.Create;
  517. NodeInfo.NodeType := ntText;
  518. NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
  519. FNodeBuffer.Add(NodeInfo);
  520. end;
  521. procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
  522. const ch: PSAXChar; Start, Count: Integer);
  523. var
  524. NodeInfo: THTMLNodeInfo;
  525. begin
  526. NodeInfo := THTMLNodeInfo.Create;
  527. NodeInfo.NodeType := ntWhitespace;
  528. NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
  529. FNodeBuffer.Add(NodeInfo);
  530. end;
  531. procedure THTMLToDOMConverter.ReaderStartElement(Sender: TObject;
  532. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  533. var
  534. NodeInfo: THTMLNodeInfo;
  535. Element: TDOMElement;
  536. i: Integer;
  537. begin
  538. {$ifdef SAX_HTML_DEBUG}
  539. WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
  540. {$endif}
  541. Element := FDocument.CreateElement(LocalName);
  542. if Assigned(Attr) then
  543. begin
  544. {$ifdef SAX_HTML_DEBUG}
  545. WriteLn('Attribute: ', Attr.GetLength);
  546. {$endif}
  547. for i := 0 to Attr.GetLength - 1 do
  548. begin
  549. {$ifdef SAX_HTML_DEBUG}
  550. WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
  551. {$endif}
  552. Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
  553. end;
  554. end;
  555. NodeInfo := THTMLNodeInfo.Create;
  556. NodeInfo.NodeType := ntTag;
  557. NodeInfo.DOMNode := Element;
  558. if IsFragmentMode then
  559. begin
  560. if not FragmentRootSet then
  561. begin
  562. FragmentRoot.AppendChild(Element);
  563. FragmentRootSet := True;
  564. end;
  565. end else
  566. if not Assigned(FDocument.DocumentElement) then
  567. FDocument.AppendChild(Element);
  568. FNodeBuffer.Add(NodeInfo);
  569. {$ifdef SAX_HTML_DEBUG}
  570. WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
  571. {$endif}
  572. end;
  573. procedure THTMLToDOMConverter.ReaderEndElement(Sender: TObject;
  574. const NamespaceURI, LocalName, RawName: SAXString);
  575. var
  576. NodeInfo, NodeInfo2: THTMLNodeInfo;
  577. i : Integer;
  578. j : THTMLElementTag;
  579. TagInfo: PHTMLElementProps;
  580. begin
  581. {$ifdef SAX_HTML_DEBUG}
  582. WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
  583. {$endif}
  584. // Find the matching start tag
  585. i := FNodeBuffer.Count - 1;
  586. while i >= 0 do
  587. begin
  588. NodeInfo := THTMLNodeInfo(FNodeBuffer.Items[i]);
  589. if (NodeInfo.NodeType = ntTag) and
  590. (CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
  591. begin
  592. // We found the matching start tag
  593. TagInfo := nil;
  594. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  595. if CompareText(HTMLElementProps[j].Name, LocalName) = 0 then
  596. begin
  597. TagInfo := @HTMLElementProps[j];
  598. break;
  599. end;
  600. Inc(i);
  601. while i < FNodeBuffer.Count do
  602. begin
  603. NodeInfo2 := THTMLNodeInfo(FNodeBuffer.Items[i]);
  604. if (NodeInfo2.NodeType = ntWhitespace) and Assigned(TagInfo) and
  605. (not (efPreserveWhitespace in TagInfo^.Flags)) then
  606. // Handle whitespace, which doesn't need to get preserved...
  607. if not (efPCDATAContent in TagInfo^.Flags) then
  608. // No character data allowed within the current element
  609. NodeInfo2.DOMNode.Free
  610. else
  611. begin
  612. // Character data allowed, so normalize it
  613. NodeInfo2.DOMNode.NodeValue := ' ';
  614. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode)
  615. end
  616. else
  617. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
  618. NodeInfo2.Free;
  619. FNodeBuffer.Delete(i);
  620. end;
  621. break;
  622. end;
  623. Dec(i);
  624. end;
  625. end;
  626. procedure ReadHTMLFile(out ADoc: THTMLDocument; const AFilename: String);
  627. var
  628. f: TStream;
  629. begin
  630. ADoc := nil;
  631. f := TFileStream.Create(AFilename, fmOpenRead);
  632. try
  633. ReadHTMLFile(ADoc, f);
  634. finally
  635. f.Free;
  636. end;
  637. end;
  638. procedure ReadHTMLFile(out ADoc: THTMLDocument; f: TStream);
  639. var
  640. Reader: THTMLReader;
  641. Converter: THTMLToDOMConverter;
  642. begin
  643. ADoc := THTMLDocument.Create;
  644. Reader := THTMLReader.Create;
  645. try
  646. Converter := THTMLToDOMConverter.Create(Reader, ADoc);
  647. try
  648. Reader.ParseStream(f);
  649. finally
  650. Converter.Free;
  651. end;
  652. finally
  653. Reader.Free;
  654. end;
  655. end;
  656. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  657. var
  658. f: TStream;
  659. begin
  660. f := TFileStream.Create(AFilename, fmOpenRead);
  661. try
  662. ReadHTMLFragment(AParentNode, f);
  663. finally
  664. f.Free;
  665. end;
  666. end;
  667. procedure ReadHTMLFragment(AParentNode: TDOMNode; f: TStream);
  668. var
  669. Reader: THTMLReader;
  670. Converter: THTMLToDOMConverter;
  671. begin
  672. Reader := THTMLReader.Create;
  673. try
  674. Converter := THTMLToDOMConverter.CreateFragment(Reader, AParentNode);
  675. try
  676. Reader.ParseStream(f);
  677. finally
  678. Converter.Free;
  679. end;
  680. finally
  681. Reader.Free;
  682. end;
  683. end;
  684. end.