sax_xml.pp 18 KB


  1. {
  2. This file is part of the Free Component Library
  3. Copyright (c) 2006 by Michael Van Canneyt.
  4. Based on SAX_HTML implementation from Sebastian Guenther.
  5. XML parser with SAX interface
  6. See the file COPYING.FPC, included in this distribution,
  7. for details about the copyright.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11. **********************************************************************}
  12. {$mode objfpc}
  13. {$h+}
  14. {$IFNDEF FPC_DOTTEDUNITS}
  15. unit SAX_XML;
  16. {$ENDIF FPC_DOTTEDUNITS}
  17. interface
  18. {$IFDEF FPC_DOTTEDUNITS}
  19. uses System.SysUtils, System.Classes, Xml.Sax, Xml.Dom;
  20. {$ELSE FPC_DOTTEDUNITS}
  21. uses SysUtils, Classes, SAX, DOM;
  22. {$ENDIF FPC_DOTTEDUNITS}
  23. type
  { TSAXXMLReader: the XML reader class, preceded by its scanner states }
  { Lexical state of the TSAXXMLReader scanner: the kind of token whose
    characters are currently being accumulated. }
  TXMLScannerContext = (
    scUnknown,          // no token in progress; the next character decides the state
    scWhitespace,       // within whitespace
    scText,             // within text
    scCData,            // within cdata section (also the state entered right after "<!")
    scComment,          // within comment
    scEntityReference,  // within entity reference ("&...;")
    scTag);             // within a start tag or end tag
  { Character-driven XML scanner. Parse reads the input stream in chunks and
    reports whitespace, text, comments, entity references and tags through the
    Do* notification methods inherited from TSAXReader. }
  TSAXXMLReader = class(TSAXReader)
  private
    FStarted: Boolean;                    // True once Parse has issued DoStartDocument
    FEndOfStream: Boolean;                // True after a read from the input stream returned no data
    FScannerContext: TXMLScannerContext;  // current scanner state
    FTokenText: SAXString;                // pending token, converted from FRawTokenText for the callbacks
    FRawTokenText: string;                // characters of the pending token as read from the input buffer
    FCurStringValueDelimiter: AnsiChar;   // quote character of the attribute value being read, #0 if none
    FAttrNameRead: Boolean;               // True after '=' was seen in a tag, until the value's closing quote
  protected
    // Reports the pending token according to the current state, then
    // switches to NewContext and clears the token state.
    procedure EnterNewScannerContext(NewContext: TXMLScannerContext);
  public
    constructor Create;
    // Issues DoEndDocument if a document was started.
    destructor Destroy; override;
    // Scans the stream of AInput until it is exhausted or the stop flag is set.
    procedure Parse(AInput: TSAXInputSource); override; overload;
    property EndOfStream: Boolean read FEndOfStream;
    property ScannerContext: TXMLScannerContext read FScannerContext;
    property TokenText: SAXString read FTokenText;
  end;
  52. { TXMLToDOMConverter }
  // Kind of content a buffered node was created from.
  TXMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag, ntComment);

  { Entry of the converter's node buffer: a DOM node together with the kind
    of SAX event that produced it. }
  TXMLNodeInfo = class
    NodeType: TXMLNodeType;  // ntTag marks elements; ReaderEndElement searches for these
    DOMNode: TDOMNode;       // the node created for the event
  end;
  { Builds a DOM tree from the events of a TSAXXMLReader. The constructors
    install the converter's handlers on the reader; nodes are collected in
    FNodeBuffer and attached to their parent element when its end tag is
    reported. }
  TXMLToDOMConverter = class
  private
    FReader: TSAXXMLReader;     // reader whose events are consumed
    FDocument: TDOMDocument;    // document used to create all nodes
    FElementStack: TList;       // created and freed by the converter; not otherwise used in this unit
    FNodeBuffer: TList;         // list of TXMLNodeInfo not yet attached to a parent
    IsFragmentMode, FragmentRootSet: Boolean;  // fragment mode flag / first element already appended to FragmentRoot
    FragmentRoot: TDOMNode;     // parent node that receives the first element in fragment mode
    procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
      Start, Count: Integer);
    procedure ReaderComment(Sender: TObject; const ch: PSAXChar;
      Start, Count: Integer);
    procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
      Start, Count: Integer);
    procedure ReaderSkippedEntity(Sender: TObject; const Name: SAXString);
    procedure ReaderStartElement(Sender: TObject;
      const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
    procedure ReaderEndElement(Sender: TObject;
      const NamespaceURI, LocalName, RawName: SAXString);
  public
    // Converts into ADocument; the first element becomes its document element.
    constructor Create(AReader: TSAXXMLReader; ADocument: TDOMDocument);
    // Converts into the owner document of AFragmentRoot; the first element
    // is appended to AFragmentRoot.
    constructor CreateFragment(AReader: TSAXXMLReader; AFragmentRoot: TDOMNode);
    destructor Destroy; override;
  end;
  // Helper functions built on TSAXXMLReader and TXMLToDOMConverter.
  // ReadXMLFile creates a new document in ADoc from a file or stream;
  // ReadXMLFragment parses a file or stream and appends the first element
  // found to AParentNode.
  procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String);
  procedure ReadXMLFile(out ADoc: TXMLDocument; f: TStream);
  procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
  procedure ReadXMLFragment(AParentNode: TDOMNode; f: TStream);
  87. implementation
  88. {$IFDEF FPC_DOTTEDUNITS}
  89. uses
  90. Xml.Utils,
  91. Html.Defs; // for entities...
  92. {$ELSE FPC_DOTTEDUNITS}
  93. uses
  94. xmlutils,
  95. htmldefs; // for entities...
  96. {$ENDIF FPC_DOTTEDUNITS}
  const
    // Whitespace characters; not referenced by the code in this unit.
    WhitespaceChars = [#9, #10, #13, ' '];

    // Replacement characters for the predefined entities. They are typed
    // constants because their address is passed to DoCharacters.
    char_lt: SAXChar = '<';
    char_gt: SAXChar = '>';
    char_quot: SAXChar = '"';
    char_apos: SAXChar = '''';
    char_amp: SAXChar = '&';
  { Creates the reader with the scanner in its neutral state. }
  constructor TSAXXMLReader.Create;
  begin
    inherited Create;
    // No token is pending yet; the first input character selects the state.
    FScannerContext := scUnknown;
  end;
  { Closes a document that was started by Parse, then destroys the reader. }
  destructor TSAXXMLReader.Destroy;
  begin
    // Parse issues DoStartDocument only once; balance it here.
    if FStarted then
      DoEndDocument;

    inherited Destroy;
  end;
  { Scans the stream of AInput and reports its content through the SAX
    notification methods.

    The stream is read in chunks of MaxBufferSize bytes and examined one
    character at a time; the scanner state lives in the object's fields, so a
    token may span chunk boundaries. DoStartDocument is issued on the first
    call only. Scanning stops when the stream returns no more data
    (EndOfStream becomes True) or when FStopFlag is set. A token that is still
    pending when the stream ends is not reported.

    Differences from the previous implementation:
    - '!' starts a declaration only directly after '<'; elsewhere in a tag
      (for example inside an attribute value) it is ordinary text.
    - '>' inside a quoted attribute value is kept instead of being dropped.
    - "<!...>" declarations other than comments and "<![...]]>" sections
      (for example DOCTYPE) are skipped up to their closing '>' instead of
      swallowing the rest of the input.
    - no zero-length characters event is reported in front of a comment. }
  procedure TSAXXMLReader.Parse(AInput: TSAXInputSource);
  const
    MaxBufferSize = 1024;
  var
    Buffer: array[0..MaxBufferSize - 1] of AnsiChar;
    BufferSize, BufferPos: Integer;
    c: AnsiChar;

    // Appends the current character to the pending token and consumes it.
    procedure Accumulate;
    begin
      FRawTokenText := FRawTokenText + Buffer[BufferPos];
      Inc(BufferPos);
    end;

    // Consumes the current character, then reports the pending token and
    // switches to NewContext.
    procedure ConsumeAndEnter(NewContext: TXMLScannerContext);
    begin
      Inc(BufferPos);
      EnterNewScannerContext(NewContext);
    end;

    // True while the pending declaration contains a '[' that has not been
    // closed by ']' yet, i.e. while inside a DOCTYPE internal subset.
    // Brackets inside quoted literals are not recognised as such.
    function InsideDeclSubset: Boolean;
    var
      k, Depth: Integer;
    begin
      Depth := 0;
      for k := 1 to Length(FRawTokenText) do
        case FRawTokenText[k] of
          '[':
            Inc(Depth);
          ']':
            if Depth > 0 then
              Dec(Depth);
        end;
      Result := Depth > 0;
    end;

  begin
    if not FStarted then
    begin
      FStarted := True;
      DoStartDocument;
    end;

    FEndOfStream := False;
    FStopFlag := False;

    while not FStopFlag do
    begin
      // Read data into the input buffer
      BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
      if BufferSize <= 0 then
      begin
        FEndOfStream := True;
        break;
      end;

      BufferPos := 0;
      while (BufferPos < BufferSize) and not FStopFlag do
      begin
        c := Buffer[BufferPos];
        case ScannerContext of
          scUnknown:
            // The character is only classified here; except for the markup
            // openers it is consumed by the state that is entered.
            case c of
              #9, #10, #13, ' ':
                EnterNewScannerContext(scWhitespace);
              '&':
                ConsumeAndEnter(scEntityReference);
              '<':
                ConsumeAndEnter(scTag);
              else
                EnterNewScannerContext(scText);
            end;

          scWhitespace:
            case c of
              #9, #10, #13, ' ':
                Accumulate;
              '&':
                ConsumeAndEnter(scEntityReference);
              '<':
                ConsumeAndEnter(scTag);
              else
                // Whitespace followed by text is reported as one text token,
                // so the state is changed without flushing.
                FScannerContext := scText;
            end;

          scText:
            case c of
              '&':
                ConsumeAndEnter(scEntityReference);
              '<':
                ConsumeAndEnter(scTag);
              else
                Accumulate;
            end;

          scCData:
            if (Length(FRawTokenText) = 0) and (c = '-') then
            begin
              // "<!-" opens a comment. Nothing is pending, so switch state
              // directly instead of reporting an empty characters event.
              Inc(BufferPos);
              FScannerContext := scComment;
            end
            else if (Length(FRawTokenText) > 0) and (FRawTokenText[1] = '[') then
            begin
              // "<![...": a section that runs up to "]]>"
              if (c = '>') and (RightStr(FRawTokenText, 2) = ']]') then
              begin
                FRawTokenText := Copy(FRawTokenText, 8, Length(FRawTokenText) - 9); // delete '[CDATA[' and ']]' from text
                ConsumeAndEnter(scUnknown);
              end
              else
                Accumulate;
            end
            else if (c = '>') and not InsideDeclSubset then
            begin
              // Any other declaration (e.g. DOCTYPE) is skipped silently.
              Inc(BufferPos);
              FRawTokenText := '';
              FScannerContext := scUnknown;
            end
            else
              Accumulate;

          scComment:
            if (c = '>') and (RightStr(FRawTokenText, 2) = '--') then
            begin
              FRawTokenText := Copy(FRawTokenText, 2, Length(FRawTokenText) - 3); // delete '-' and '--' from text
              ConsumeAndEnter(scUnknown);
            end
            else
              Accumulate;

          scEntityReference:
            if c = ';' then
              ConsumeAndEnter(scUnknown)
            else if c in ['a'..'z', 'A'..'Z', '0'..'9', '#'] then
              Accumulate
            else
              // Not part of a reference: report what was read and let the
              // next state look at this character again.
              EnterNewScannerContext(scUnknown);

          scTag:
            case c of
              '''', '"':
                begin
                  // Quotes only delimit a value after '=' has been seen.
                  if FAttrNameRead then
                  begin
                    if FCurStringValueDelimiter = #0 then
                      FCurStringValueDelimiter := c
                    else if FCurStringValueDelimiter = c then
                    begin
                      FCurStringValueDelimiter := #0;
                      FAttrNameRead := False;
                    end;
                  end;
                  Accumulate;
                end;
              '=':
                begin
                  FAttrNameRead := True;
                  Accumulate;
                end;
              '!':
                if Length(FRawTokenText) = 0 then
                  // "<!" opens a comment, CDATA section or declaration
                  ConsumeAndEnter(scCData)
                else
                  Accumulate;
              '>':
                if FCurStringValueDelimiter = #0 then
                  ConsumeAndEnter(scUnknown)
                else
                  // '>' inside a quoted attribute value belongs to the value
                  Accumulate;
              else
                Accumulate;
            end;
        end; // case ScannerContext of
      end; // while not end of buffer
    end;
  end;
  { Splits the text of a tag (without the angle brackets) into the element
    name and its attributes.

    s      - tag text, e.g.  name a="1" b='2' c=3
    Attr   - output: nil when s contains no whitespace, otherwise a newly
             created attribute list that the caller must free. Attributes
             whose name is not a valid XML name are left out.
    Result - the text up to the first whitespace character.

    Whitespace is accepted on both sides of '='. Values may be enclosed in
    single or double quotes; an unquoted value ends at the next whitespace.
    A name without '=' is added with an empty value. Entity references inside
    values are not resolved. }
  function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
  var
    Len, Cur, NameStart, ValueStart: Integer;
    AttrName: SAXString;
    ValueDelimiter: WideChar;

    // Advances Cur past any run of whitespace.
    procedure SkipWhitespace;
    begin
      while (Cur <= Len) and IsXMLWhitespace(s[Cur]) do
        Inc(Cur);
    end;

  begin
    Attr := nil;
    Len := Length(s);

    // The element name ends at the first whitespace character.
    Cur := 1;
    while (Cur <= Len) and not IsXMLWhitespace(s[Cur]) do
      Inc(Cur);
    if Cur > Len then
    begin
      Result := s;
      Exit;
    end;

    Result := Copy(s, 1, Cur - 1);
    Attr := TSAXAttributes.Create;
    SkipWhitespace;

    while Cur <= Len do
    begin
      // Attribute name: everything up to '=' or whitespace. It may be empty
      // when the text starts with '='; the '=' branch below still makes
      // progress in that case.
      NameStart := Cur;
      while (Cur <= Len) and (s[Cur] <> '=') and not IsXMLWhitespace(s[Cur]) do
        Inc(Cur);
      AttrName := Copy(s, NameStart, Cur - NameStart);
      SkipWhitespace;

      if (Cur <= Len) and (s[Cur] = '=') then
      begin
        Inc(Cur);
        SkipWhitespace;
        if (Cur <= Len) and ((s[Cur] = '''') or (s[Cur] = '"')) then
        begin
          // Quoted value: runs to the matching quote, or to the end of the
          // text if the quote is never closed.
          ValueDelimiter := s[Cur];
          Inc(Cur);
          ValueStart := Cur;
          while (Cur <= Len) and (s[Cur] <> ValueDelimiter) do
            Inc(Cur);
          if IsXMLName(AttrName) then
            Attr.AddAttribute('', AttrName, '', '', Copy(s, ValueStart, Cur - ValueStart));
          if Cur <= Len then
            Inc(Cur); // skip the closing quote
        end
        else
        begin
          // Unquoted value: runs to the next whitespace.
          ValueStart := Cur;
          while (Cur <= Len) and not IsXMLWhitespace(s[Cur]) do
            Inc(Cur);
          if IsXMLName(AttrName) then
            Attr.AddAttribute('', AttrName, '', '', Copy(s, ValueStart, Cur - ValueStart));
        end;
      end
      else if IsXMLName(AttrName) then
        // Name without a value
        Attr.AddAttribute('', AttrName, '', '', '');

      SkipWhitespace;
    end;
  end;
  { Reports the pending token according to the current scanner state, then
    switches to NewContext and clears the token state.

    - whitespace     -> DoIgnorableWhitespace
    - text, CDATA    -> DoCharacters
    - comment        -> DoComment
    - entity ref.    -> DoCharacters for numeric references and the five
                        predefined entities, DoSkippedEntity otherwise
    - tag            -> DoStartElement and/or DoEndElement; tags whose text
                        starts with '!' or '?' are ignored

    The attribute list created for a tag is released even when one of the
    element notifications raises an exception. }
  procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
  var
    Attr: TSAXAttributes;
    TagName: SAXString;
    Ent: SAXChar;
  begin
    FTokenText := FRawTokenText;  // this is where conversion takes place
    case ScannerContext of
      scWhitespace:
        DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
      scText,
      scCData:
        DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
      scComment:
        DoComment(PSAXChar(TokenText), 0, Length(TokenText));
      scEntityReference:
        begin
          if (Length(TokenText) >= 2) and (TokenText[1] = '#') and
            (((TokenText[2] >= '0') and (TokenText[2] <= '9')) or (TokenText[2] = 'x')) and
            // here actually using it to resolve character references
            ResolveHTMLEntityReference(TokenText, Ent) then
            DoCharacters(@Ent, 0, 1)
          else if TokenText = 'lt' then
            DoCharacters(@char_lt, 0, 1)
          else if TokenText = 'gt' then
            DoCharacters(@char_gt, 0, 1)
          else if TokenText = 'amp' then
            DoCharacters(@char_amp, 0, 1)
          else if TokenText = 'quot' then
            DoCharacters(@char_quot, 0, 1)
          else if TokenText = 'apos' then
            DoCharacters(@char_apos, 0, 1)
          else
            DoSkippedEntity(TokenText);
        end;
      scTag:
        if Length(TokenText) > 0 then
        begin
          Attr := nil;
          try
            if TokenText[Length(FTokenText)] = '/' then  // handle empty tag
            begin
              SetLength(FTokenText, Length(FTokenText) - 1);
              // Do NOT combine to a single line, as Attr is an output value!
              TagName := SplitTagString(TokenText, Attr);
              DoStartElement('', TagName, '', Attr);
              DoEndElement('', TagName, '');
            end
            else if TokenText[1] = '/' then
            begin
              // End tag: the name follows the leading '/'
              TagName := SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr);
              DoEndElement('', TagName, '');
            end
            else if (TokenText[1] <> '!') and (TokenText[1] <> '?') then
            begin
              // Do NOT combine to a single line, as Attr is an output value!
              TagName := SplitTagString(TokenText, Attr);
              DoStartElement('', TagName, '', Attr);
            end;
          finally
            // Free is a no-op for nil, so no Assigned check is needed.
            Attr.Free;
          end;
        end;
    end;

    FScannerContext := NewContext;
    FTokenText := '';
    FRawTokenText := '';
    FCurStringValueDelimiter := #0;
    FAttrNameRead := False;
  end;
  419. { TXMLToDOMConverter }
  { Creates a converter that builds the content reported by AReader inside
    ADocument. The converter installs its own handlers on the reader,
    replacing any handlers assigned before.

    The comment handler is installed as well; previously ReaderComment was
    implemented but never connected, so comments never reached the DOM. }
  constructor TXMLToDOMConverter.Create(AReader: TSAXXMLReader;
    ADocument: TDOMDocument);
  begin
    inherited Create;
    FReader := AReader;
    FReader.OnCharacters := @ReaderCharacters;
    FReader.OnComment := @ReaderComment;
    FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
    FReader.OnSkippedEntity := @ReaderSkippedEntity;
    FReader.OnStartElement := @ReaderStartElement;
    FReader.OnEndElement := @ReaderEndElement;
    FDocument := ADocument;
    FElementStack := TList.Create;
    FNodeBuffer := TList.Create;
  end;
  { Creates a converter in fragment mode: nodes are created by the owner
    document of AFragmentRoot, and the first element reported by the reader
    is appended to AFragmentRoot. }
  constructor TXMLToDOMConverter.CreateFragment(AReader: TSAXXMLReader;
    AFragmentRoot: TDOMNode);
  begin
    // Reuse the regular set-up, then switch to fragment mode.
    Create(AReader, AFragmentRoot.OwnerDocument);

    IsFragmentMode := True;
    FragmentRoot := AFragmentRoot;
  end;
  { Releases the bookkeeping objects of the converter. Only the TXMLNodeInfo
    wrappers are freed here, not the DOM nodes they refer to. }
  destructor TXMLToDOMConverter.Destroy;
  var
    Index: Integer;
  begin
    // Theoretically, always exactly one item will remain - the root element.
    Index := FNodeBuffer.Count;
    while Index > 0 do
    begin
      Dec(Index);
      TXMLNodeInfo(FNodeBuffer[Index]).Free;
    end;

    FNodeBuffer.Free;
    FElementStack.Free;
    inherited Destroy;
  end;
  { Handler for character data: buffers a text node holding Count characters
    of ch, beginning at offset Start. The offset was ignored before, which
    only worked because the reader in this unit always passes 0. }
  procedure TXMLToDOMConverter.ReaderCharacters(Sender: TObject;
    const ch: PSAXChar; Start, Count: Integer);
  var
    NodeInfo: TXMLNodeInfo;
  begin
    NodeInfo := TXMLNodeInfo.Create;
    NodeInfo.NodeType := ntText;
    NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch + Start, Count, False);
    FNodeBuffer.Add(NodeInfo);
  end;
  { Handler for comments: buffers a comment node holding Count characters of
    ch, beginning at offset Start. The offset was ignored before, which only
    worked because the reader in this unit always passes 0. }
  procedure TXMLToDOMConverter.ReaderComment(Sender: TObject;
    const ch: PSAXChar; Start, Count: Integer);
  var
    NodeInfo: TXMLNodeInfo;
  begin
    NodeInfo := TXMLNodeInfo.Create;
    NodeInfo.NodeType := ntComment;
    NodeInfo.DOMNode := FDocument.CreateCommentBuf(ch + Start, Count);
    FNodeBuffer.Add(NodeInfo);
  end;
  { Handler for whitespace runs: buffers a text node holding Count characters
    of ch, beginning at offset Start, and marks it as whitespace. The offset
    was ignored before, which only worked because the reader in this unit
    always passes 0. }
  procedure TXMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
    const ch: PSAXChar; Start, Count: Integer);
  var
    NodeInfo: TXMLNodeInfo;
  begin
    NodeInfo := TXMLNodeInfo.Create;
    NodeInfo.NodeType := ntWhitespace;
    NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch + Start, Count, False);
    FNodeBuffer.Add(NodeInfo);
  end;
  { Handler for entities the reader did not resolve: buffers an entity
    reference node carrying the entity's name. }
  procedure TXMLToDOMConverter.ReaderSkippedEntity(Sender: TObject;
    const Name: SAXString);
  var
    Entry: TXMLNodeInfo;
  begin
    Entry := TXMLNodeInfo.Create;
    Entry.DOMNode := FDocument.CreateEntityReference(Name);
    Entry.NodeType := ntEntityReference;
    FNodeBuffer.Add(Entry);
  end;
  { Handler for start tags: creates the element named LocalName, copies the
    attributes onto it and puts it into the node buffer. The first element
    seen is attached right away - to FragmentRoot in fragment mode, otherwise
    to the document if it has no document element yet. }
  procedure TXMLToDOMConverter.ReaderStartElement(Sender: TObject;
    const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
  var
    Entry: TXMLNodeInfo;
    NewElement: TDOMElement;
    AttrIndex: Integer;
  begin
    NewElement := FDocument.CreateElement(LocalName);

    // Attr is nil for tags without an attribute part.
    if Attr <> nil then
      for AttrIndex := 0 to Attr.GetLength - 1 do
        NewElement[Attr.GetLocalName(AttrIndex)] := Attr.GetValue(AttrIndex);

    Entry := TXMLNodeInfo.Create;
    Entry.NodeType := ntTag;
    Entry.DOMNode := NewElement;

    if IsFragmentMode then
    begin
      // Only the first element of a fragment is hooked to the fragment root.
      if not FragmentRootSet then
      begin
        FragmentRoot.AppendChild(NewElement);
        FragmentRootSet := True;
      end;
    end
    else if FDocument.DocumentElement = nil then
      FDocument.AppendChild(NewElement);

    FNodeBuffer.Add(Entry);
  end;
  { Handler for end tags: finds the most recently buffered element named
    LocalName and makes every node buffered after it a child of that element.
    The element itself stays in the buffer until its own parent is closed.
    An end tag without a matching element is ignored.

    Names are compared exactly, because XML names are case-sensitive; the
    previous case-insensitive comparison let "</a>" close "<A>".

    NOTE(review): an element that is already closed but still buffered is
    indistinguishable from an open one here, so for an element that directly
    contains an element of the same name the end tag of the outer element
    matches the inner one and the outer element is never completed. }
  procedure TXMLToDOMConverter.ReaderEndElement(Sender: TObject;
    const NamespaceURI, LocalName, RawName: SAXString);
  var
    NodeInfo, ChildInfo: TXMLNodeInfo;
    i: Integer;
  begin
    // Find the matching start tag, searching from the newest entry
    for i := FNodeBuffer.Count - 1 downto 0 do
    begin
      NodeInfo := TXMLNodeInfo(FNodeBuffer.Items[i]);
      if (NodeInfo.NodeType = ntTag) and
        (NodeInfo.DOMNode.NodeName = LocalName) then
      begin
        // Move everything behind the start tag into the element. Each entry
        // is removed as soon as it has been handled, so the buffer never
        // holds a freed TXMLNodeInfo.
        while FNodeBuffer.Count > i + 1 do
        begin
          ChildInfo := TXMLNodeInfo(FNodeBuffer.Items[i + 1]);
          NodeInfo.DOMNode.AppendChild(ChildInfo.DOMNode);
          ChildInfo.Free;
          FNodeBuffer.Delete(i + 1);
        end;
        Break;
      end;
    end;
  end;
  { Reads the file AFilename into a newly created document returned in ADoc.
    ADoc is nil if the file cannot be opened. }
  procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String);
  var
    Source: TStream;
  begin
    ADoc := nil;

    Source := TFileStream.Create(AFilename, fmOpenRead);
    try
      // The stream overload does the actual work.
      ReadXMLFile(ADoc, Source);
    finally
      Source.Free;
    end;
  end;
  { Parses the XML read from stream f into a newly created document returned
    in ADoc. If parsing raises an exception, the partially built document is
    freed, ADoc is set to nil and the exception is passed on; previously the
    document was leaked in that case. }
  procedure ReadXMLFile(out ADoc: TXMLDocument; f: TStream);
  var
    Reader: TSAXXMLReader;
    Converter: TXMLToDOMConverter;
  begin
    ADoc := TXMLDocument.Create;
    try
      Reader := TSAXXMLReader.Create;
      try
        Converter := TXMLToDOMConverter.Create(Reader, ADoc);
        try
          Reader.ParseStream(f);
        finally
          Converter.Free;
        end;
      finally
        Reader.Free;
      end;
    except
      // Do not hand out (or leak) a half-built document.
      FreeAndNil(ADoc);
      raise;
    end;
  end;
  { Parses the file AFilename as a fragment below AParentNode. }
  procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
  var
    Source: TStream;
  begin
    Source := TFileStream.Create(AFilename, fmOpenRead);
    try
      // The stream overload does the actual work.
      ReadXMLFragment(AParentNode, Source);
    finally
      Source.Free;
    end;
  end;
  { Parses the XML read from stream f as a fragment: nodes are created by the
    owner document of AParentNode and the first element found is appended to
    AParentNode. }
  procedure ReadXMLFragment(AParentNode: TDOMNode; f: TStream);
  var
    XMLReader: TSAXXMLReader;
    DOMBuilder: TXMLToDOMConverter;
  begin
    XMLReader := TSAXXMLReader.Create;
    try
      DOMBuilder := TXMLToDOMConverter.CreateFragment(XMLReader, AParentNode);
      try
        XMLReader.ParseStream(f);
      finally
        // The converter is released before the reader it is attached to.
        DOMBuilder.Free;
      end;
    finally
      XMLReader.Free;
    end;
  end;
  613. end.