sax_html.pp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811
  1. {
  2. This file is part of the Free Component Library
  3. HTML parser with SAX-like interface
  4. Copyright (c) 2000-2002 by
  5. Areca Systems GmbH / Sebastian Guenther, [email protected]
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {
  13. Known problems:
  14. * The whitespace handling does only work for processing the DOM tree.
  15. Storing the DOM tree to a XML file will result in a quite ugly file.
  16. (This probably has got much better with recent versions, which do
  17. decent whitespace converting, but it's not tested really good.)
  18. * Entity references in attribute values don't get parsed.
  19. }
  20. {$mode objfpc}
  21. {$H+}
  22. unit SAX_HTML;
  23. interface
  24. uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs,xmlutils;
  25. type
  26. { THTMLReader: The HTML reader class }
  27. THTMLScannerContext = (
  28. scUnknown,
  29. scWhitespace, // within whitespace
  30. scText, // within text
  31. scEntityReference, // within entity reference ("&...;")
  32. scTag, // within a start tag or end tag
  33. scComment,
  34. scScript
  35. );
  36. THTMLReader = class(TSAXReader)
  37. private
  38. FStarted: Boolean;
  39. FEndOfStream: Boolean;
  40. FScannerContext: THTMLScannerContext;
  41. FTokenText: SAXString;
  42. FRawTokenText: string;
  43. FScriptEndTag: string;
  44. FScriptEndMatchPos: Integer;
  45. FCurStringValueDelimiter: Char;
  46. FAttrNameRead: Boolean;
  47. FStack: array of THTMLElementTag;
  48. FNesting: Integer;
  49. procedure AutoClose(const aName: SAXString);
  50. procedure NamePush(const aName: SAXString);
  51. procedure NamePop;
  52. protected
  53. procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
  54. public
  55. constructor Create;
  56. destructor Destroy; override;
  57. procedure Parse(AInput: TSAXInputSource); override; overload;
  58. property EndOfStream: Boolean read FEndOfStream;
  59. property ScannerContext: THTMLScannerContext read FScannerContext;
  60. property TokenText: SAXString read FTokenText;
  61. end;
  62. { THTMLToDOMConverter }
  63. THTMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
  64. THTMLNodeInfo = class
  65. NodeType: THTMLNodeType;
  66. Closed: Boolean;
  67. DOMNode: TDOMNode;
  68. end;
  69. THTMLToDOMConverter = class
  70. private
  71. FReader: THTMLReader;
  72. FDocument: TDOMDocument;
  73. FNodeBuffer: TList;
  74. IsFragmentMode, FragmentRootSet: Boolean;
  75. FragmentRoot: TDOMNode;
  76. procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
  77. Start, Count: Integer);
  78. procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
  79. Start, Count: Integer);
  80. procedure ReaderStartElement(Sender: TObject;
  81. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  82. procedure ReaderEndElement(Sender: TObject;
  83. const NamespaceURI, LocalName, RawName: SAXString);
  84. public
  85. constructor Create(AReader: THTMLReader; ADocument: TDOMDocument);
  86. constructor CreateFragment(AReader: THTMLReader; AFragmentRoot: TDOMNode);
  87. destructor Destroy; override;
  88. end;
  89. // Helper functions; these ones are HTML equivalents of ReadXML[File|Fragment]
  90. procedure ReadHTMLFile(out ADoc: THTMLDocument; const AFilename: String);
  91. procedure ReadHTMLFile(out ADoc: THTMLDocument; f: TStream);
  92. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  93. procedure ReadHTMLFragment(AParentNode: TDOMNode; f: TStream);
  94. implementation
  95. constructor THTMLReader.Create;
  96. begin
  97. inherited Create;
  98. FScannerContext := scUnknown;
  99. SetLength(FStack, 16);
  100. end;
  101. destructor THTMLReader.Destroy;
  102. begin
  103. if FStarted then
  104. DoEndDocument;
  105. inherited Destroy;
  106. end;
  107. function CheckForName(const Tag: SAXString): Boolean;
  108. var
  109. p, p1: PSAXChar;
  110. begin
  111. p := PSAXChar(Tag);
  112. result := False;
  113. if p^ <> '!' then
  114. begin
  115. if p^ = '/' then Inc(p);
  116. p1 := p;
  117. while (p1^ <> #0) and (p1^ <> '/') and not IsXMLWhitespace(p1^) do
  118. Inc(p1);
  119. result := IsXMLName(p, p1-p);
  120. end;
  121. end;
  122. procedure THTMLReader.Parse(AInput: TSAXInputSource);
  123. const
  124. MaxBufferSize = 1024;
  125. var
  126. Buffer: array[0..MaxBufferSize - 1] of Char;
  127. BufferSize, BufferPos: Integer;
  128. len: Integer;
  129. ch: Char;
  130. begin
  131. if not FStarted then
  132. begin
  133. FStarted := True;
  134. DoStartDocument;
  135. end;
  136. FEndOfStream := False;
  137. FStopFlag := False;
  138. while not FStopFlag do
  139. begin
  140. // Read data into the input buffer
  141. BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
  142. if BufferSize = 0 then
  143. begin
  144. FEndOfStream := True;
  145. break;
  146. end;
  147. BufferPos := 0;
  148. while (BufferPos < BufferSize) and not FStopFlag do
  149. begin
  150. case ScannerContext of
  151. scUnknown:
  152. case Buffer[BufferPos] of
  153. #9, #10, #13, ' ':
  154. EnterNewScannerContext(scWhitespace);
  155. '&':
  156. begin
  157. Inc(BufferPos);
  158. EnterNewScannerContext(scEntityReference);
  159. end;
  160. '<':
  161. begin
  162. Inc(BufferPos);
  163. EnterNewScannerContext(scTag);
  164. end;
  165. else
  166. EnterNewScannerContext(scText);
  167. end;
  168. scWhitespace:
  169. case Buffer[BufferPos] of
  170. #9, #10, #13, ' ':
  171. begin
  172. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  173. Inc(BufferPos);
  174. end;
  175. '&':
  176. begin
  177. Inc(BufferPos);
  178. EnterNewScannerContext(scEntityReference);
  179. end;
  180. '<':
  181. begin
  182. Inc(BufferPos);
  183. EnterNewScannerContext(scTag);
  184. end;
  185. else
  186. FScannerContext := scText;
  187. end;
  188. scText:
  189. case Buffer[BufferPos] of
  190. '&':
  191. begin
  192. Inc(BufferPos);
  193. EnterNewScannerContext(scEntityReference);
  194. end;
  195. '<':
  196. begin
  197. Inc(BufferPos);
  198. EnterNewScannerContext(scTag);
  199. end;
  200. else
  201. begin
  202. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  203. Inc(BufferPos);
  204. end;
  205. end;
  206. scEntityReference:
  207. if Buffer[BufferPos] = ';' then
  208. begin
  209. Inc(BufferPos);
  210. EnterNewScannerContext(scUnknown);
  211. end else if not (Buffer[BufferPos] in
  212. ['a'..'z', 'A'..'Z', '0'..'9', '#']) then
  213. EnterNewScannerContext(scUnknown)
  214. else
  215. begin
  216. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  217. Inc(BufferPos);
  218. end;
  219. scTag:
  220. case Buffer[BufferPos] of
  221. '''', '"':
  222. begin
  223. if FAttrNameRead then
  224. begin
  225. if FCurStringValueDelimiter = #0 then
  226. FCurStringValueDelimiter := Buffer[BufferPos]
  227. else if FCurStringValueDelimiter = Buffer[BufferPos] then
  228. begin
  229. FCurStringValueDelimiter := #0;
  230. FAttrNameRead := False;
  231. end;
  232. end;
  233. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  234. Inc(BufferPos);
  235. end;
  236. '=':
  237. begin
  238. FAttrNameRead := True;
  239. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  240. Inc(BufferPos);
  241. end;
  242. '>':
  243. begin
  244. Inc(BufferPos);
  245. if FCurStringValueDelimiter = #0 then
  246. EnterNewScannerContext(scUnknown);
  247. end;
  248. '<': // either an unclosed tag or unescaped '<' in text; attempt recovery
  249. begin
  250. // TODO: this check is hardly complete, probably must also check if
  251. // tag name is followed by legal attributes.
  252. if CheckForName(FRawTokenText) then { <-- ansi to wide conversion here }
  253. EnterNewScannerContext(scUnknown) // assume unclosed tag
  254. else if (FRawTokenText <> '') and (FRawTokenText[1] <> '!') then
  255. begin
  256. Insert('<', FRawTokenText, 1); // assume plaintext
  257. FScannerContext := scText;
  258. EnterNewScannerContext(scUnknown);
  259. end
  260. else
  261. begin // in comment, ignore
  262. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  263. Inc(BufferPos);
  264. end;
  265. end;
  266. else
  267. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  268. if FRawTokenText='!--' then
  269. begin
  270. FScannerContext := scComment;
  271. FRawTokenText := '';
  272. end;
  273. Inc(BufferPos);
  274. end;
  275. scComment:
  276. begin
  277. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  278. Inc(BufferPos);
  279. if (Buffer[BufferPos-1]='>') then
  280. begin
  281. len:=length(FRawTokenText);
  282. if (len>2) and (FRawTokenText[len-1]='-') and (FRawTokenText[len-2]='-') then
  283. begin
  284. Delete(FRawTokenText, Length(FRawTokenText)-2, MaxInt);
  285. EnterNewScannerContext(scUnknown);
  286. end;
  287. end;
  288. end;
  289. scScript:
  290. begin
  291. ch := Buffer[BufferPos];
  292. if FScriptEndMatchPos <= Length(FScriptEndTag) then
  293. begin
  294. if lowercase(ch) = FScriptEndTag[FScriptEndMatchPos] then
  295. Inc(FScriptEndMatchPos)
  296. else
  297. FScriptEndMatchPos := 1;
  298. FRawTokenText := FRawTokenText + ch;
  299. Inc(BufferPos);
  300. end
  301. else
  302. begin
  303. case ch of
  304. #9,#10,#13,' ':
  305. begin
  306. FRawTokenText := FRawTokenText + ch;
  307. Inc(BufferPos);
  308. Inc(FScriptEndMatchPos);
  309. end;
  310. '>':
  311. begin
  312. Inc(BufferPos);
  313. Delete(FRawTokenText, Length(FRawTokenText)-FScriptEndMatchPos+2, MaxInt);
  314. EnterNewScannerContext(scUnknown);
  315. end;
  316. else
  317. FRawTokenText := FRawTokenText + ch;
  318. Inc(BufferPos);
  319. FScriptEndMatchPos := 1;
  320. end;
  321. end;
  322. end;
  323. end; // case ScannerContext of
  324. end; // while not endOfBuffer
  325. end;
  326. end;
  327. function LookupTag(const aName: SAXString): THTMLElementTag;
  328. var
  329. j: THTMLElementTag;
  330. ansiName: string;
  331. begin
  332. ansiName := aName;
  333. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  334. if SameText(HTMLElementProps[j].Name, ansiName) then
  335. begin
  336. Result := j;
  337. Exit;
  338. end;
  339. Result := etUnknown;
  340. end;
  341. procedure THTMLReader.AutoClose(const aName: SAXString);
  342. var
  343. newTag: THTMLElementTag;
  344. begin
  345. newTag := LookupTag(aName);
  346. while (FNesting > 0) and IsAutoClose(newTag, FStack[FNesting-1]) do
  347. begin
  348. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  349. namePop;
  350. end;
  351. end;
  352. procedure THTMLReader.NamePush(const aName: SAXString);
  353. var
  354. tag: THTMLElementTag;
  355. begin
  356. tag := LookupTag(aName);
  357. if FNesting >= Length(FStack) then
  358. SetLength(FStack, FNesting * 2);
  359. FStack[FNesting] := tag;
  360. Inc(FNesting);
  361. end;
  362. procedure THTMLReader.NamePop;
  363. begin
  364. if FNesting <= 0 then
  365. Exit;
  366. Dec(FNesting);
  367. FStack[FNesting] := etUnknown;
  368. end;
  369. function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
  370. var
  371. i, j, len: Integer;
  372. AttrName: SAXString;
  373. ValueDelimiter: WideChar;
  374. haseq, hasname: Boolean;
  375. begin
  376. Attr := nil;
  377. i := 0;
  378. len := Length(s);
  379. repeat
  380. Inc(i)
  381. until (i > len) or IsXMLWhitespace(s[i]);
  382. Result := Copy(s, 1, i - 1);
  383. WStrLower(Result);
  384. if i > len then
  385. exit;
  386. Attr := TSAXAttributes.Create;
  387. Inc(i);
  388. repeat
  389. while (i <= len) and IsXMLWhitespace(s[i]) do
  390. Inc(i);
  391. if (i > len) then
  392. break;
  393. j := i;
  394. haseq := false;
  395. hasname := false;
  396. ValueDelimiter := #0;
  397. // Attr Name
  398. while (j <= len) and not (IsXMLWhitespace(s[j]) or (s[j] = '=')) do
  399. Inc(j);
  400. // j points on =, whitespace or after s
  401. AttrName := Copy(s, i, j - i);
  402. WStrLower(AttrName);
  403. hasname := IsXMLName(AttrName);
  404. // Look for = and following whitespaces (maybe j already points on =)
  405. while (j <= len) and (IsXMLWhitespace(s[j]) or (s[j] = '=')) do
  406. begin
  407. if (s[j] = '=') then
  408. haseq := true;
  409. Inc(j);
  410. end;
  411. // Value (j points on first nonblank or after end)
  412. if haseq then
  413. begin
  414. if (j > len) then { terminal case <tag attr=> }
  415. begin
  416. if hasname then
  417. Attr.AddAttribute('', AttrName, '', '', '');
  418. break;
  419. end
  420. else
  421. begin
  422. if (s[j]='''') or (s[j]='"') then
  423. begin
  424. ValueDelimiter := s[j];
  425. Inc(j);
  426. end;
  427. i := j;
  428. while (j <= len) do
  429. begin
  430. if (s[j]=ValueDelimiter) then
  431. break;
  432. if (ValueDelimiter=#0) and IsXMLWhitespace(s[j]) then
  433. break;
  434. Inc(j);
  435. end;
  436. if hasname then
  437. Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i))
  438. end;
  439. end
  440. else if hasname then { html boolean-style attribute }
  441. Attr.AddAttribute('', AttrName, '', '', AttrName);
  442. { skip closing quote if one is present }
  443. i := j + ord(ValueDelimiter<>#0);
  444. until false;
  445. end;
  446. procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
  447. var
  448. Attr: TSAXAttributes;
  449. TagName: SAXString;
  450. Ent: SAXChar;
  451. i: Integer;
  452. elTag: THTMLElementTag;
  453. begin
  454. FTokenText := FRawTokenText;
  455. case ScannerContext of
  456. scWhitespace:
  457. if (FNesting > 0) and (efPCDataContent in HTMLElementProps[FStack[FNesting-1]].Flags) then
  458. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText))
  459. else
  460. DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
  461. scText:
  462. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  463. scEntityReference:
  464. begin
  465. if ResolveHTMLEntityReference(TokenText, Ent) then
  466. DoCharacters(@Ent, 0, 1)
  467. else
  468. DoCharacters(PSAXChar('&' + TokenText + ';'), 0, Length(TokenText) + 2);
  469. end;
  470. scTag:
  471. if Length(TokenText) > 0 then
  472. begin
  473. Attr := nil;
  474. if TokenText[Length(fTokenText)]='/' then // handle xml/xhtml style empty tag
  475. begin
  476. setlength(fTokenText,length(fTokenText)-1);
  477. // Do NOT combine to a single line, as Attr is an output value!
  478. TagName := SplitTagString(TokenText, Attr);
  479. AutoClose(TagName);
  480. DoStartElement('', TagName, '', Attr);
  481. DoEndElement('', TagName, '');
  482. end
  483. else if TokenText[1] = '/' then
  484. begin
  485. Delete(FTokenText, 1, 1);
  486. TagName := SplitTagString(TokenText, Attr);
  487. elTag := LookupTag(TagName);
  488. i := FNesting-1;
  489. while (i >= 0) and (FStack[i] <> elTag) and
  490. (efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
  491. Dec(i);
  492. if (i>=0) and (FStack[i] = elTag) then
  493. while FStack[FNesting-1] <> elTag do
  494. begin
  495. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  496. namePop;
  497. end;
  498. DoEndElement('', TagName, '');
  499. namePop;
  500. end
  501. else if TokenText[1] <> '!' then
  502. begin
  503. // Do NOT combine to a single line, as Attr is an output value!
  504. TagName := SplitTagString(TokenText, Attr);
  505. AutoClose(TagName);
  506. namePush(TagName);
  507. DoStartElement('', TagName, '', Attr);
  508. if FStack[FNesting-1] in [etScript,etStyle] then
  509. begin
  510. NewContext := scScript;
  511. FScriptEndTag := '</' + HTMLElementProps[FStack[FNesting-1]].Name;
  512. FScriptEndMatchPos := 1;
  513. end;
  514. end;
  515. if Assigned(Attr) then
  516. Attr.Free;
  517. end;
  518. scComment:
  519. begin
  520. DoComment(PSAXChar(TokenText), 0, Length(TokenText));
  521. end;
  522. scScript:
  523. begin
  524. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  525. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  526. namePop;
  527. FScriptEndTag := '';
  528. end;
  529. end;
  530. FScannerContext := NewContext;
  531. FTokenText := '';
  532. FRawTokenText := '';
  533. FCurStringValueDelimiter := #0;
  534. FAttrNameRead := False;
  535. end;
  536. { THTMLToDOMConverter }
  537. constructor THTMLToDOMConverter.Create(AReader: THTMLReader;
  538. ADocument: TDOMDocument);
  539. begin
  540. inherited Create;
  541. FReader := AReader;
  542. FReader.OnCharacters := @ReaderCharacters;
  543. FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
  544. FReader.OnStartElement := @ReaderStartElement;
  545. FReader.OnEndElement := @ReaderEndElement;
  546. FDocument := ADocument;
  547. FNodeBuffer := TList.Create;
  548. end;
  549. constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
  550. AFragmentRoot: TDOMNode);
  551. begin
  552. Create(AReader, AFragmentRoot.OwnerDocument);
  553. FragmentRoot := AFragmentRoot;
  554. IsFragmentMode := True;
  555. end;
  556. destructor THTMLToDOMConverter.Destroy;
  557. var
  558. i: Integer;
  559. begin
  560. // Theoretically, always exactly one item will remain - the root element:
  561. for i := 0 to FNodeBuffer.Count - 1 do
  562. THTMLNodeInfo(FNodeBuffer[i]).Free;
  563. FNodeBuffer.Free;
  564. inherited Destroy;
  565. end;
  566. procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
  567. const ch: PSAXChar; Start, Count: Integer);
  568. var
  569. NodeInfo: THTMLNodeInfo;
  570. begin
  571. NodeInfo := THTMLNodeInfo.Create;
  572. NodeInfo.NodeType := ntText;
  573. NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
  574. FNodeBuffer.Add(NodeInfo);
  575. end;
  576. procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
  577. const ch: PSAXChar; Start, Count: Integer);
  578. var
  579. NodeInfo: THTMLNodeInfo;
  580. begin
  581. NodeInfo := THTMLNodeInfo.Create;
  582. NodeInfo.NodeType := ntWhitespace;
  583. NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
  584. FNodeBuffer.Add(NodeInfo);
  585. end;
  586. procedure THTMLToDOMConverter.ReaderStartElement(Sender: TObject;
  587. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  588. var
  589. NodeInfo: THTMLNodeInfo;
  590. Element: TDOMElement;
  591. i: Integer;
  592. begin
  593. {$ifdef SAX_HTML_DEBUG}
  594. WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
  595. {$endif}
  596. Element := FDocument.CreateElement(LocalName);
  597. if Assigned(Attr) then
  598. begin
  599. {$ifdef SAX_HTML_DEBUG}
  600. WriteLn('Attribute: ', Attr.GetLength);
  601. {$endif}
  602. for i := 0 to Attr.GetLength - 1 do
  603. begin
  604. {$ifdef SAX_HTML_DEBUG}
  605. WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
  606. {$endif}
  607. Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
  608. end;
  609. end;
  610. NodeInfo := THTMLNodeInfo.Create;
  611. NodeInfo.NodeType := ntTag;
  612. NodeInfo.DOMNode := Element;
  613. if IsFragmentMode then
  614. begin
  615. if not FragmentRootSet then
  616. begin
  617. FragmentRoot.AppendChild(Element);
  618. FragmentRootSet := True;
  619. end;
  620. end else
  621. if not Assigned(FDocument.DocumentElement) then
  622. FDocument.AppendChild(Element);
  623. FNodeBuffer.Add(NodeInfo);
  624. {$ifdef SAX_HTML_DEBUG}
  625. WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
  626. {$endif}
  627. end;
  628. procedure THTMLToDOMConverter.ReaderEndElement(Sender: TObject;
  629. const NamespaceURI, LocalName, RawName: SAXString);
  630. var
  631. NodeInfo, NodeInfo2: THTMLNodeInfo;
  632. i : Integer;
  633. j : THTMLElementTag;
  634. TagInfo: PHTMLElementProps;
  635. begin
  636. {$ifdef SAX_HTML_DEBUG}
  637. WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
  638. {$endif}
  639. // Find the matching start tag
  640. i := FNodeBuffer.Count - 1;
  641. while i >= 0 do
  642. begin
  643. NodeInfo := THTMLNodeInfo(FNodeBuffer.Items[i]);
  644. if (NodeInfo.NodeType = ntTag) and (not NodeInfo.Closed) and
  645. (CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
  646. begin
  647. // We found the matching start tag
  648. TagInfo := nil;
  649. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  650. if CompareText(HTMLElementProps[j].Name, LocalName) = 0 then
  651. begin
  652. TagInfo := @HTMLElementProps[j];
  653. break;
  654. end;
  655. Inc(i);
  656. while i < FNodeBuffer.Count do
  657. begin
  658. NodeInfo2 := THTMLNodeInfo(FNodeBuffer.Items[i]);
  659. if (NodeInfo2.NodeType = ntWhitespace) and Assigned(TagInfo) and
  660. (not (efPreserveWhitespace in TagInfo^.Flags)) then
  661. // Handle whitespace, which doesn't need to get preserved...
  662. if not (efPCDATAContent in TagInfo^.Flags) then
  663. // No character data allowed within the current element
  664. NodeInfo2.DOMNode.Free
  665. else
  666. begin
  667. // Character data allowed, so normalize it
  668. NodeInfo2.DOMNode.NodeValue := ' ';
  669. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode)
  670. end
  671. else
  672. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
  673. NodeInfo2.Free;
  674. FNodeBuffer.Delete(i);
  675. end;
  676. NodeInfo.Closed := True;
  677. break;
  678. end;
  679. Dec(i);
  680. end;
  681. end;
  682. procedure ReadHTMLFile(out ADoc: THTMLDocument; const AFilename: String);
  683. var
  684. f: TStream;
  685. begin
  686. ADoc := nil;
  687. f := TFileStream.Create(AFilename, fmOpenRead);
  688. try
  689. ReadHTMLFile(ADoc, f);
  690. finally
  691. f.Free;
  692. end;
  693. end;
  694. procedure ReadHTMLFile(out ADoc: THTMLDocument; f: TStream);
  695. var
  696. Reader: THTMLReader;
  697. Converter: THTMLToDOMConverter;
  698. begin
  699. ADoc := THTMLDocument.Create;
  700. Reader := THTMLReader.Create;
  701. try
  702. Converter := THTMLToDOMConverter.Create(Reader, ADoc);
  703. try
  704. Reader.ParseStream(f);
  705. finally
  706. Converter.Free;
  707. end;
  708. finally
  709. Reader.Free;
  710. end;
  711. end;
  712. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  713. var
  714. f: TStream;
  715. begin
  716. f := TFileStream.Create(AFilename, fmOpenRead);
  717. try
  718. ReadHTMLFragment(AParentNode, f);
  719. finally
  720. f.Free;
  721. end;
  722. end;
  723. procedure ReadHTMLFragment(AParentNode: TDOMNode; f: TStream);
  724. var
  725. Reader: THTMLReader;
  726. Converter: THTMLToDOMConverter;
  727. begin
  728. Reader := THTMLReader.Create;
  729. try
  730. Converter := THTMLToDOMConverter.CreateFragment(Reader, AParentNode);
  731. try
  732. Reader.ParseStream(f);
  733. finally
  734. Converter.Free;
  735. end;
  736. finally
  737. Reader.Free;
  738. end;
  739. end;
  740. end.