sax_html.pp 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829
  1. {
  2. This file is part of the Free Component Library
  3. HTML parser with SAX-like interface
  4. Copyright (c) 2000-2002 by
  5. Areca Systems GmbH / Sebastian Guenther, [email protected]
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {
  13. Known problems:
  14. * The whitespace handling does only work for processing the DOM tree.
  15. Storing the DOM tree to a XML file will result in a quite ugly file.
  16. (This probably has got much better with recent versions, which do
  17. decent whitespace converting, but it's not tested really good.)
  18. * Entity references in attribute values don't get parsed.
  19. }
  20. {$mode objfpc}
  21. {$H+}
  22. {$IFNDEF FPC_DOTTEDUNITS}
  23. unit SAX_HTML;
  24. {$ENDIF FPC_DOTTEDUNITS}
  25. interface
  26. {$IFDEF FPC_DOTTEDUNITS}
  27. uses System.SysUtils, System.Classes, Xml.Sax, Xml.Dom, Html.Dom, Html.Defs,Xml.Utils;
  28. {$ELSE FPC_DOTTEDUNITS}
  29. uses SysUtils, Classes, SAX, DOM, DOM_HTML,htmldefs,xmlutils;
  30. {$ENDIF FPC_DOTTEDUNITS}
  31. type
  32. { THTMLReader: The HTML reader class }
  33. THTMLScannerContext = (
  34. scUnknown,
  35. scWhitespace, // within whitespace
  36. scText, // within text
  37. scEntityReference, // within entity reference ("&...;")
  38. scTag, // within a start tag or end tag
  39. scComment,
  40. scScript
  41. );
  42. THTMLReader = class(TSAXReader)
  43. private
  44. FStarted: Boolean;
  45. FEndOfStream: Boolean;
  46. FScannerContext: THTMLScannerContext;
  47. FTokenText: SAXString;
  48. FRawTokenText: string;
  49. FScriptEndTag: string;
  50. FScriptEndMatchPos: Integer;
  51. FCurStringValueDelimiter: AnsiChar;
  52. FAttrNameRead: Boolean;
  53. FStack: array of THTMLElementTag;
  54. FNesting: Integer;
  55. procedure AutoClose(const aName: SAXString);
  56. procedure NamePush(const aName: SAXString);
  57. procedure NamePop;
  58. protected
  59. procedure EnterNewScannerContext(NewContext: THTMLScannerContext);
  60. public
  61. constructor Create;
  62. destructor Destroy; override;
  63. procedure Parse(AInput: TSAXInputSource); override; overload;
  64. property EndOfStream: Boolean read FEndOfStream;
  65. property ScannerContext: THTMLScannerContext read FScannerContext;
  66. property TokenText: SAXString read FTokenText;
  67. end;
  68. { THTMLToDOMConverter }
  69. THTMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
  70. THTMLNodeInfo = class
  71. NodeType: THTMLNodeType;
  72. Closed: Boolean;
  73. DOMNode: TDOMNode;
  74. end;
  75. THTMLToDOMConverter = class
  76. private
  77. FReader: THTMLReader;
  78. FDocument: TDOMDocument;
  79. FNodeBuffer: TList;
  80. IsFragmentMode, FragmentRootSet: Boolean;
  81. FragmentRoot: TDOMNode;
  82. procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
  83. Start, Count: Integer);
  84. procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
  85. Start, Count: Integer);
  86. procedure ReaderStartElement(Sender: TObject;
  87. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  88. procedure ReaderEndElement(Sender: TObject;
  89. const NamespaceURI, LocalName, RawName: SAXString);
  90. public
  91. constructor Create(AReader: THTMLReader; ADocument: TDOMDocument);
  92. constructor CreateFragment(AReader: THTMLReader; AFragmentRoot: TDOMNode);
  93. destructor Destroy; override;
  94. Property Document : TDOMDocument Read FDocument;
  95. end;
  96. // Helper functions; these ones are HTML equivalents of ReadXML[File|Fragment]
  97. procedure ReadHTMLFile(out ADoc: THTMLDocument; const AFilename: String);
  98. procedure ReadHTMLFile(out ADoc: THTMLDocument; f: TStream);
  99. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  100. procedure ReadHTMLFragment(AParentNode: TDOMNode; f: TStream);
  101. implementation
  102. constructor THTMLReader.Create;
  103. begin
  104. inherited Create;
  105. FScannerContext := scUnknown;
  106. SetLength(FStack, 16);
  107. end;
  108. destructor THTMLReader.Destroy;
  109. begin
  110. if FStarted then
  111. DoEndDocument;
  112. inherited Destroy;
  113. end;
  114. function CheckForName(const Tag: SAXString): Boolean;
  115. var
  116. p, p1: PSAXChar;
  117. begin
  118. p := PSAXChar(Tag);
  119. result := False;
  120. if p^ <> '!' then
  121. begin
  122. if p^ = '/' then Inc(p);
  123. p1 := p;
  124. while (p1^ <> #0) and (p1^ <> '/') and not IsXMLWhitespace(p1^) do
  125. Inc(p1);
  126. result := IsXMLName(p, p1-p);
  127. end;
  128. end;
  129. procedure THTMLReader.Parse(AInput: TSAXInputSource);
  130. const
  131. MaxBufferSize = 1024;
  132. var
  133. Buffer: array[0..MaxBufferSize - 1] of AnsiChar;
  134. BufferSize, BufferPos: Integer;
  135. len: Integer;
  136. ch: AnsiChar;
  137. begin
  138. if not FStarted then
  139. begin
  140. FStarted := True;
  141. DoStartDocument;
  142. end;
  143. FEndOfStream := False;
  144. FStopFlag := False;
  145. while not FStopFlag do
  146. begin
  147. // Read data into the input buffer
  148. BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
  149. if BufferSize = 0 then
  150. begin
  151. FEndOfStream := True;
  152. break;
  153. end;
  154. BufferPos := 0;
  155. while (BufferPos < BufferSize) and not FStopFlag do
  156. begin
  157. case ScannerContext of
  158. scUnknown:
  159. case Buffer[BufferPos] of
  160. #9, #10, #13, ' ':
  161. EnterNewScannerContext(scWhitespace);
  162. '&':
  163. begin
  164. Inc(BufferPos);
  165. EnterNewScannerContext(scEntityReference);
  166. end;
  167. '<':
  168. begin
  169. Inc(BufferPos);
  170. EnterNewScannerContext(scTag);
  171. end;
  172. else
  173. EnterNewScannerContext(scText);
  174. end;
  175. scWhitespace:
  176. case Buffer[BufferPos] of
  177. #9, #10, #13, ' ':
  178. begin
  179. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  180. Inc(BufferPos);
  181. end;
  182. '&':
  183. begin
  184. Inc(BufferPos);
  185. EnterNewScannerContext(scEntityReference);
  186. end;
  187. '<':
  188. begin
  189. Inc(BufferPos);
  190. EnterNewScannerContext(scTag);
  191. end;
  192. else
  193. FScannerContext := scText;
  194. end;
  195. scText:
  196. case Buffer[BufferPos] of
  197. '&':
  198. begin
  199. Inc(BufferPos);
  200. EnterNewScannerContext(scEntityReference);
  201. end;
  202. '<':
  203. begin
  204. Inc(BufferPos);
  205. EnterNewScannerContext(scTag);
  206. end;
  207. else
  208. begin
  209. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  210. Inc(BufferPos);
  211. end;
  212. end;
  213. scEntityReference:
  214. if Buffer[BufferPos] = ';' then
  215. begin
  216. Inc(BufferPos);
  217. EnterNewScannerContext(scUnknown);
  218. end else if not (Buffer[BufferPos] in
  219. ['a'..'z', 'A'..'Z', '0'..'9', '#']) then
  220. EnterNewScannerContext(scUnknown)
  221. else
  222. begin
  223. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  224. Inc(BufferPos);
  225. end;
  226. scTag:
  227. case Buffer[BufferPos] of
  228. '''', '"':
  229. begin
  230. if FAttrNameRead then
  231. begin
  232. if FCurStringValueDelimiter = #0 then
  233. FCurStringValueDelimiter := Buffer[BufferPos]
  234. else if FCurStringValueDelimiter = Buffer[BufferPos] then
  235. begin
  236. FCurStringValueDelimiter := #0;
  237. FAttrNameRead := False;
  238. end;
  239. end;
  240. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  241. Inc(BufferPos);
  242. end;
  243. '=':
  244. begin
  245. FAttrNameRead := True;
  246. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  247. Inc(BufferPos);
  248. end;
  249. '>':
  250. begin
  251. Inc(BufferPos);
  252. if FCurStringValueDelimiter = #0 then
  253. EnterNewScannerContext(scUnknown);
  254. end;
  255. '<': // either an unclosed tag or unescaped '<' in text; attempt recovery
  256. begin
  257. // TODO: this check is hardly complete, probably must also check if
  258. // tag name is followed by legal attributes.
  259. if CheckForName(FRawTokenText) then { <-- ansi to wide conversion here }
  260. EnterNewScannerContext(scUnknown) // assume unclosed tag
  261. else if (FRawTokenText <> '') and (FRawTokenText[1] <> '!') then
  262. begin
  263. Insert('<', FRawTokenText, 1); // assume plaintext
  264. FScannerContext := scText;
  265. EnterNewScannerContext(scUnknown);
  266. end
  267. else
  268. begin // in comment, ignore
  269. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  270. Inc(BufferPos);
  271. end;
  272. end;
  273. else
  274. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  275. if FRawTokenText='!--' then
  276. begin
  277. FScannerContext := scComment;
  278. FRawTokenText := '';
  279. end;
  280. Inc(BufferPos);
  281. end;
  282. scComment:
  283. begin
  284. FRawTokenText := FRawTokenText + Buffer[BufferPos];
  285. Inc(BufferPos);
  286. if (Buffer[BufferPos-1]='>') then
  287. begin
  288. len:=length(FRawTokenText);
  289. if (len>2) and (FRawTokenText[len-1]='-') and (FRawTokenText[len-2]='-') then
  290. begin
  291. Delete(FRawTokenText, Length(FRawTokenText)-2, MaxInt);
  292. EnterNewScannerContext(scUnknown);
  293. end;
  294. end;
  295. end;
  296. scScript:
  297. begin
  298. ch := Buffer[BufferPos];
  299. if FScriptEndMatchPos <= Length(FScriptEndTag) then
  300. begin
  301. if lowercase(ch) = FScriptEndTag[FScriptEndMatchPos] then
  302. Inc(FScriptEndMatchPos)
  303. else
  304. FScriptEndMatchPos := 1;
  305. FRawTokenText := FRawTokenText + ch;
  306. Inc(BufferPos);
  307. end
  308. else
  309. begin
  310. case ch of
  311. #9,#10,#13,' ':
  312. begin
  313. FRawTokenText := FRawTokenText + ch;
  314. Inc(BufferPos);
  315. Inc(FScriptEndMatchPos);
  316. end;
  317. '>':
  318. begin
  319. Inc(BufferPos);
  320. Delete(FRawTokenText, Length(FRawTokenText)-FScriptEndMatchPos+2, MaxInt);
  321. EnterNewScannerContext(scUnknown);
  322. end;
  323. else
  324. FRawTokenText := FRawTokenText + ch;
  325. Inc(BufferPos);
  326. FScriptEndMatchPos := 1;
  327. end;
  328. end;
  329. end;
  330. end; // case ScannerContext of
  331. end; // while not endOfBuffer
  332. end;
  333. end;
  334. function LookupTag(const aName: SAXString): THTMLElementTag;
  335. var
  336. j: THTMLElementTag;
  337. ansiName: string;
  338. begin
  339. ansiName := aName;
  340. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  341. if SameText(HTMLElementProps[j].Name, ansiName) then
  342. begin
  343. Result := j;
  344. Exit;
  345. end;
  346. Result := etUnknown;
  347. end;
  348. procedure THTMLReader.AutoClose(const aName: SAXString);
  349. var
  350. newTag: THTMLElementTag;
  351. begin
  352. newTag := LookupTag(aName);
  353. while (FNesting > 0) and IsAutoClose(newTag, FStack[FNesting-1]) do
  354. begin
  355. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  356. namePop;
  357. end;
  358. end;
  359. procedure THTMLReader.NamePush(const aName: SAXString);
  360. var
  361. tag: THTMLElementTag;
  362. begin
  363. tag := LookupTag(aName);
  364. if FNesting >= Length(FStack) then
  365. SetLength(FStack, FNesting * 2);
  366. FStack[FNesting] := tag;
  367. Inc(FNesting);
  368. end;
  369. procedure THTMLReader.NamePop;
  370. begin
  371. if FNesting <= 0 then
  372. Exit;
  373. Dec(FNesting);
  374. FStack[FNesting] := etUnknown;
  375. end;
  376. function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
  377. var
  378. i, j, len: Integer;
  379. AttrName: SAXString;
  380. ValueDelimiter: WideChar;
  381. haseq, hasname: Boolean;
  382. begin
  383. Attr := nil;
  384. i := 0;
  385. len := Length(s);
  386. repeat
  387. Inc(i)
  388. until (i > len) or IsXMLWhitespace(s[i]);
  389. Result := Copy(s, 1, i - 1);
  390. WStrLower(Result);
  391. if i > len then
  392. exit;
  393. Attr := TSAXAttributes.Create;
  394. Inc(i);
  395. repeat
  396. while (i <= len) and IsXMLWhitespace(s[i]) do
  397. Inc(i);
  398. if (i > len) then
  399. break;
  400. j := i;
  401. haseq := false;
  402. hasname := false;
  403. ValueDelimiter := #0;
  404. // Attr Name
  405. while (j <= len) and not (IsXMLWhitespace(s[j]) or (s[j] = '=')) do
  406. Inc(j);
  407. // j points on =, whitespace or after s
  408. AttrName := Copy(s, i, j - i);
  409. WStrLower(AttrName);
  410. hasname := IsXMLName(AttrName);
  411. // Look for = and following whitespaces (maybe j already points on =)
  412. while (j <= len) and (IsXMLWhitespace(s[j]) or (s[j] = '=')) do
  413. begin
  414. if (s[j] = '=') then
  415. haseq := true;
  416. Inc(j);
  417. end;
  418. // Value (j points on first nonblank or after end)
  419. if haseq then
  420. begin
  421. if (j > len) then { terminal case <tag attr=> }
  422. begin
  423. if hasname then
  424. Attr.AddAttribute('', AttrName, '', '', '');
  425. break;
  426. end
  427. else
  428. begin
  429. if (s[j]='''') or (s[j]='"') then
  430. begin
  431. ValueDelimiter := s[j];
  432. Inc(j);
  433. end;
  434. i := j;
  435. while (j <= len) do
  436. begin
  437. if (s[j]=ValueDelimiter) then
  438. break;
  439. if (ValueDelimiter=#0) and IsXMLWhitespace(s[j]) then
  440. break;
  441. Inc(j);
  442. end;
  443. if hasname then
  444. Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i))
  445. end;
  446. end
  447. else if hasname then { html boolean-style attribute }
  448. Attr.AddAttribute('', AttrName, '', '', AttrName);
  449. { skip closing quote if one is present }
  450. i := j + ord(ValueDelimiter<>#0);
  451. until false;
  452. end;
  453. procedure THTMLReader.EnterNewScannerContext(NewContext: THTMLScannerContext);
  454. var
  455. Attr: TSAXAttributes;
  456. TagName: SAXString;
  457. Ent: SAXChar;
  458. i: Integer;
  459. elTag: THTMLElementTag;
  460. begin
  461. FTokenText := FRawTokenText;
  462. case ScannerContext of
  463. scWhitespace:
  464. if (FNesting > 0) and (efPCDataContent in HTMLElementProps[FStack[FNesting-1]].Flags) then
  465. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText))
  466. else
  467. DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
  468. scText:
  469. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  470. scEntityReference:
  471. begin
  472. if ResolveHTMLEntityReference(TokenText, Ent) then
  473. DoCharacters(@Ent, 0, 1)
  474. else
  475. DoCharacters(PSAXChar('&' + TokenText + ';'), 0, Length(TokenText) + 2);
  476. end;
  477. scTag:
  478. if Length(TokenText) > 0 then
  479. begin
  480. Attr := nil;
  481. if TokenText[Length(fTokenText)]='/' then // handle xml/xhtml style empty tag
  482. begin
  483. setlength(fTokenText,length(fTokenText)-1);
  484. // Do NOT combine to a single line, as Attr is an output value!
  485. TagName := SplitTagString(TokenText, Attr);
  486. AutoClose(TagName);
  487. DoStartElement('', TagName, '', Attr);
  488. DoEndElement('', TagName, '');
  489. end
  490. else if TokenText[1] = '/' then
  491. begin
  492. Delete(FTokenText, 1, 1);
  493. TagName := SplitTagString(TokenText, Attr);
  494. elTag := LookupTag(TagName);
  495. i := FNesting-1;
  496. while (i >= 0) and (FStack[i] <> elTag) and
  497. (efEndTagOptional in HTMLElementProps[FStack[i]].Flags) do
  498. Dec(i);
  499. if (i>=0) and (FStack[i] = elTag) then
  500. while FStack[FNesting-1] <> elTag do
  501. begin
  502. DoEndElement('', HTMLElementProps[FStack[FNesting-1]].Name, '');
  503. namePop;
  504. end;
  505. DoEndElement('', TagName, '');
  506. namePop;
  507. end
  508. else if TokenText[1] <> '!' then
  509. begin
  510. // Do NOT combine to a single line, as Attr is an output value!
  511. TagName := SplitTagString(TokenText, Attr);
  512. AutoClose(TagName);
  513. namePush(TagName);
  514. DoStartElement('', TagName, '', Attr);
  515. if FStack[FNesting-1] in [etScript,etStyle] then
  516. begin
  517. NewContext := scScript;
  518. FScriptEndTag := '</' + HTMLElementProps[FStack[FNesting-1]].Name;
  519. FScriptEndMatchPos := 1;
  520. end;
  521. if (efSubcontent*HTMLElementProps[FStack[FNesting-1]].Flags=[]) then begin
  522. // do not push empty elements, don't wait for AutoClose
  523. DoEndElement('', TagName, '');
  524. NamePop;
  525. end;
  526. end;
  527. if Assigned(Attr) then
  528. Attr.Free;
  529. end;
  530. scComment:
  531. begin
  532. DoComment(PSAXChar(TokenText), 0, Length(TokenText));
  533. end;
  534. scScript:
  535. begin
  536. DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
  537. DoEndElement('', Copy(FScriptEndTag, 3), '');
  538. NamePop;
  539. FScriptEndTag := '';
  540. end;
  541. end;
  542. FScannerContext := NewContext;
  543. FTokenText := '';
  544. FRawTokenText := '';
  545. FCurStringValueDelimiter := #0;
  546. FAttrNameRead := False;
  547. end;
  548. { THTMLToDOMConverter }
  549. constructor THTMLToDOMConverter.Create(AReader: THTMLReader;
  550. ADocument: TDOMDocument);
  551. begin
  552. inherited Create;
  553. FReader := AReader;
  554. FReader.OnCharacters := @ReaderCharacters;
  555. FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
  556. FReader.OnStartElement := @ReaderStartElement;
  557. FReader.OnEndElement := @ReaderEndElement;
  558. FDocument := ADocument;
  559. FNodeBuffer := TList.Create;
  560. end;
  561. constructor THTMLToDOMConverter.CreateFragment(AReader: THTMLReader;
  562. AFragmentRoot: TDOMNode);
  563. begin
  564. Create(AReader, AFragmentRoot.OwnerDocument);
  565. FragmentRoot := AFragmentRoot;
  566. IsFragmentMode := True;
  567. end;
  568. destructor THTMLToDOMConverter.Destroy;
  569. var
  570. i: Integer;
  571. begin
  572. // Theoretically, always exactly one item will remain - the root element:
  573. for i := 0 to FNodeBuffer.Count - 1 do
  574. THTMLNodeInfo(FNodeBuffer[i]).Free;
  575. FNodeBuffer.Free;
  576. inherited Destroy;
  577. end;
  578. procedure THTMLToDOMConverter.ReaderCharacters(Sender: TObject;
  579. const ch: PSAXChar; Start, Count: Integer);
  580. var
  581. NodeInfo: THTMLNodeInfo;
  582. begin
  583. NodeInfo := THTMLNodeInfo.Create;
  584. NodeInfo.NodeType := ntText;
  585. NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
  586. FNodeBuffer.Add(NodeInfo);
  587. end;
  588. procedure THTMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
  589. const ch: PSAXChar; Start, Count: Integer);
  590. var
  591. NodeInfo: THTMLNodeInfo;
  592. begin
  593. NodeInfo := THTMLNodeInfo.Create;
  594. NodeInfo.NodeType := ntWhitespace;
  595. NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
  596. FNodeBuffer.Add(NodeInfo);
  597. end;
  598. procedure THTMLToDOMConverter.ReaderStartElement(Sender: TObject;
  599. const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  600. var
  601. NodeInfo: THTMLNodeInfo;
  602. Element: TDOMElement;
  603. i: Integer;
  604. begin
  605. {$ifdef SAX_HTML_DEBUG}
  606. WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
  607. {$endif}
  608. Element := FDocument.CreateElement(LocalName);
  609. if Assigned(Attr) then
  610. begin
  611. {$ifdef SAX_HTML_DEBUG}
  612. WriteLn('Attribute: ', Attr.GetLength);
  613. {$endif}
  614. for i := 0 to Attr.GetLength - 1 do
  615. begin
  616. {$ifdef SAX_HTML_DEBUG}
  617. WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
  618. {$endif}
  619. Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
  620. end;
  621. end;
  622. NodeInfo := THTMLNodeInfo.Create;
  623. NodeInfo.NodeType := ntTag;
  624. NodeInfo.DOMNode := Element;
  625. NodeInfo.Closed := false;
  626. if IsFragmentMode then
  627. begin
  628. if not FragmentRootSet then
  629. begin
  630. FragmentRoot.AppendChild(Element);
  631. FragmentRootSet := True;
  632. end;
  633. end else
  634. if not Assigned(FDocument.DocumentElement) then
  635. FDocument.AppendChild(Element);
  636. FNodeBuffer.Add(NodeInfo);
  637. {$ifdef SAX_HTML_DEBUG}
  638. WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
  639. {$endif}
  640. end;
  641. procedure THTMLToDOMConverter.ReaderEndElement(Sender: TObject;
  642. const NamespaceURI, LocalName, RawName: SAXString);
  643. var
  644. NodeInfo, NodeInfo2: THTMLNodeInfo;
  645. i : Integer;
  646. j : THTMLElementTag;
  647. TagInfo: PHTMLElementProps;
  648. begin
  649. {$ifdef SAX_HTML_DEBUG}
  650. WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
  651. {$endif}
  652. // Find the matching start tag
  653. i := FNodeBuffer.Count - 1;
  654. while i >= 0 do
  655. begin
  656. NodeInfo := THTMLNodeInfo(FNodeBuffer.Items[i]);
  657. if (NodeInfo.NodeType = ntTag) and (not NodeInfo.Closed) and
  658. (CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
  659. begin
  660. // We found the matching start tag
  661. TagInfo := nil;
  662. for j := Low(THTMLElementTag) to High(THTMLElementTag) do
  663. if CompareText(HTMLElementProps[j].Name, LocalName) = 0 then
  664. begin
  665. TagInfo := @HTMLElementProps[j];
  666. break;
  667. end;
  668. Inc(i);
  669. while i < FNodeBuffer.Count do
  670. begin
  671. NodeInfo2 := THTMLNodeInfo(FNodeBuffer.Items[i]);
  672. if (NodeInfo2.NodeType = ntWhitespace) and Assigned(TagInfo) and
  673. (not (efPreserveWhitespace in TagInfo^.Flags)) then
  674. // Handle whitespace, which doesn't need to get preserved...
  675. if not (efPCDATAContent in TagInfo^.Flags) then
  676. // No character data allowed within the current element
  677. NodeInfo2.DOMNode.Free
  678. else
  679. begin
  680. // Character data allowed, so normalize it
  681. NodeInfo2.DOMNode.NodeValue := ' ';
  682. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode)
  683. end
  684. else
  685. NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
  686. NodeInfo2.Free;
  687. FNodeBuffer.Delete(i);
  688. end;
  689. NodeInfo.Closed := True;
  690. break;
  691. end;
  692. Dec(i);
  693. end;
  694. end;
  695. procedure ReadHTMLFile(out ADoc: THTMLDocument; const AFilename: String);
  696. var
  697. f: TStream;
  698. begin
  699. ADoc := nil;
  700. f := TFileStream.Create(AFilename, fmOpenRead);
  701. try
  702. ReadHTMLFile(ADoc, f);
  703. finally
  704. f.Free;
  705. end;
  706. end;
  707. procedure ReadHTMLFile(out ADoc: THTMLDocument; f: TStream);
  708. var
  709. Reader: THTMLReader;
  710. Converter: THTMLToDOMConverter;
  711. begin
  712. ADoc := THTMLDocument.Create;
  713. Reader := THTMLReader.Create;
  714. try
  715. Converter := THTMLToDOMConverter.Create(Reader, ADoc);
  716. try
  717. Reader.ParseStream(f);
  718. Converter.Document.RebuildIDList;
  719. finally
  720. Converter.Free;
  721. end;
  722. finally
  723. Reader.Free;
  724. end;
  725. end;
  726. procedure ReadHTMLFragment(AParentNode: TDOMNode; const AFilename: String);
  727. var
  728. f: TStream;
  729. begin
  730. f := TFileStream.Create(AFilename, fmOpenRead);
  731. try
  732. ReadHTMLFragment(AParentNode, f);
  733. finally
  734. f.Free;
  735. end;
  736. end;
  737. procedure ReadHTMLFragment(AParentNode: TDOMNode; f: TStream);
  738. var
  739. Reader: THTMLReader;
  740. Converter: THTMLToDOMConverter;
  741. begin
  742. Reader := THTMLReader.Create;
  743. try
  744. Converter := THTMLToDOMConverter.CreateFragment(Reader, AParentNode);
  745. try
  746. Reader.ParseStream(f);
  747. if aParentNode is TDOMElement then
  748. Converter.Document.RebuildIDsOfElement(aParentNode as TDOMElement)
  749. else
  750. Converter.Document.RebuildIDList;
  751. finally
  752. Converter.Free;
  753. end;
  754. finally
  755. Reader.Free;
  756. end;
  757. end;
  758. end.