123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616 |
- {
- This file is part of the Free Component Library
- Copyright (c) 2006 by Michael Van Canneyt.
- Based on SAX_HTML implementation from Sebastian Guenther.
- XML parser with SAX interface
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- **********************************************************************}
- {$mode objfpc}
- {$h+}
- unit SAX_XML;
- interface
- uses SysUtils, Classes, SAX, DOM;
- type
- { TXMLReader: The XML reader class }
- TXMLScannerContext = (
- scUnknown,
- scWhitespace, // within whitespace
- scText, // within text
- scEntityReference, // within entity reference ("&...;")
- scTag); // within a start tag or end tag
- TSAXXMLReader = class(TSAXReader)
- private
- FStarted: Boolean;
- FEndOfStream: Boolean;
- FScannerContext: TXMLScannerContext;
- FTokenText: SAXString;
- FRawTokenText: string;
- FCurStringValueDelimiter: Char;
- FAttrNameRead: Boolean;
- protected
- procedure EnterNewScannerContext(NewContext: TXMLScannerContext);
- public
- constructor Create;
- destructor Destroy; override;
- procedure Parse(AInput: TSAXInputSource); override; overload;
- property EndOfStream: Boolean read FEndOfStream;
- property ScannerContext: TXMLScannerContext read FScannerContext;
- property TokenText: SAXString read FTokenText;
- end;
- { TXMLToDOMConverter }
- TXMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
- TXMLNodeInfo = class
- NodeType: TXMLNodeType;
- DOMNode: TDOMNode;
- end;
- TXMLToDOMConverter = class
- private
- FReader: TSAXXMLReader;
- FDocument: TDOMDocument;
- FElementStack: TList;
- FNodeBuffer: TList;
- IsFragmentMode, FragmentRootSet: Boolean;
- FragmentRoot: TDOMNode;
- procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
- Start, Count: Integer);
- procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
- Start, Count: Integer);
- procedure ReaderSkippedEntity(Sender: TObject; const Name: SAXString);
- procedure ReaderStartElement(Sender: TObject;
- const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
- procedure ReaderEndElement(Sender: TObject;
- const NamespaceURI, LocalName, RawName: SAXString);
- public
- constructor Create(AReader: TSAXXMLReader; ADocument: TDOMDocument);
- constructor CreateFragment(AReader: TSAXXMLReader; AFragmentRoot: TDOMNode);
- destructor Destroy; override;
- end;
- // Helper functions; these ones are XML equivalents of ReadXML[File|Fragment]
- procedure ReadXMLFile(var ADoc: TXMLDocument; const AFilename: String);
- procedure ReadXMLFile(var ADoc: TXMLDocument; var f: TStream);
- procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
- procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
- implementation
- uses
- xmlutils,
- htmldefs; // for entities...
- const
- WhitespaceChars = [#9, #10, #13, ' '];
- char_lt: SAXChar = '<';
- char_gt: SAXChar = '>';
- char_quot: SAXChar = '"';
- char_apos: SAXChar = '''';
- char_amp: SAXChar = '&';
- constructor TSAXXMLReader.Create;
- begin
- inherited Create;
- FScannerContext := scUnknown;
- end;
- destructor TSAXXMLReader.Destroy;
- begin
- if FStarted then
- DoEndDocument;
- inherited Destroy;
- end;
- procedure TSAXXMLReader.Parse(AInput: TSAXInputSource);
- const
- MaxBufferSize = 1024;
- var
- Buffer: array[0..MaxBufferSize - 1] of Char;
- BufferSize, BufferPos: Integer;
- begin
- if not FStarted then
- begin
- FStarted := True;
- DoStartDocument;
- end;
- FEndOfStream := False;
- FStopFlag := False;
- while not FStopFlag do
- begin
- // Read data into the input buffer
- BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
- if BufferSize = 0 then
- begin
- FEndOfStream := True;
- break;
- end;
- BufferPos := 0;
- while (BufferPos < BufferSize) and not FStopFlag do
- begin
- case ScannerContext of
- scUnknown:
- case Buffer[BufferPos] of
- #9, #10, #13, ' ':
- EnterNewScannerContext(scWhitespace);
- '&':
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scEntityReference);
- end;
- '<':
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scTag);
- end;
- else
- EnterNewScannerContext(scText);
- end;
- scWhitespace:
- case Buffer[BufferPos] of
- #9, #10, #13, ' ':
- begin
- FRawTokenText := FRawTokenText + Buffer[BufferPos];
- Inc(BufferPos);
- end;
- '&':
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scEntityReference);
- end;
- '<':
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scTag);
- end;
- else
- FScannerContext := scText;
- end;
- scText:
- case Buffer[BufferPos] of
- '&':
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scEntityReference);
- end;
- '<':
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scTag);
- end;
- else
- begin
- FRawTokenText := FRawTokenText + Buffer[BufferPos];
- Inc(BufferPos);
- end;
- end;
- scEntityReference:
- if Buffer[BufferPos] = ';' then
- begin
- Inc(BufferPos);
- EnterNewScannerContext(scUnknown);
- end else if not (Buffer[BufferPos] in
- ['a'..'z', 'A'..'Z', '0'..'9', '#']) then
- EnterNewScannerContext(scUnknown)
- else
- begin
- FRawTokenText := FRawTokenText + Buffer[BufferPos];
- Inc(BufferPos);
- end;
- scTag:
- case Buffer[BufferPos] of
- '''', '"':
- begin
- if FAttrNameRead then
- begin
- if FCurStringValueDelimiter = #0 then
- FCurStringValueDelimiter := Buffer[BufferPos]
- else if FCurStringValueDelimiter = Buffer[BufferPos] then
- begin
- FCurStringValueDelimiter := #0;
- FAttrNameRead := False;
- end;
- end;
- FRawTokenText := FRawTokenText + Buffer[BufferPos];
- Inc(BufferPos);
- end;
- '=':
- begin
- FAttrNameRead := True;
- FRawTokenText := FRawTokenText + Buffer[BufferPos];
- Inc(BufferPos);
- end;
- '>':
- begin
- Inc(BufferPos);
- if FCurStringValueDelimiter = #0 then
- EnterNewScannerContext(scUnknown);
- end;
- else
- begin
- FRawTokenText := FRawTokenText + Buffer[BufferPos];
- Inc(BufferPos);
- end;
- end;
- end; // case ScannerContext of
- end; // while not endOfBuffer
- end;
- end;
- function SplitTagString(const s: SAXString; var Attr: TSAXAttributes): SAXString;
- var
- i, j: Integer;
- AttrName: SAXString;
- ValueDelimiter: WideChar;
- DoIncJ: Boolean;
- begin
- Attr := nil;
- i := 0;
- repeat
- Inc(i)
- until (i > Length(s)) or IsXMLWhitespace(s[i]);
- if i > Length(s) then
- Result := s
- else
- begin
- Result := Copy(s, 1, i - 1);
- Attr := TSAXAttributes.Create;
- Inc(i);
- while (i <= Length(s)) and IsXMLWhitespace(s[i]) do
- Inc(i);
- SetLength(AttrName, 0);
- j := i;
- while j <= Length(s) do
- if s[j] = '=' then
- begin
- AttrName := Copy(s, i, j - i);
- Inc(j);
- if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
- begin
- ValueDelimiter := s[j];
- Inc(j);
- end else
- ValueDelimiter := #0;
- i := j;
- DoIncJ := False;
- while j <= Length(s) do
- if ValueDelimiter = #0 then
- if IsXMLWhitespace(s[j]) then
- break
- else
- Inc(j)
- else if s[j] = ValueDelimiter then
- begin
- DoIncJ := True;
- break
- end else
- Inc(j);
- if IsXMLName(AttrName) then
- Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
- if DoIncJ then
- Inc(j);
- while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
- Inc(j);
- i := j;
- end
- else if IsXMLWhitespace(s[j]) then
- begin
- if IsXMLName(@s[i], j-i) then
- Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
- Inc(j);
- while (j <= Length(s)) and IsXMLWhitespace(s[j]) do
- Inc(j);
- i := j;
- end else
- Inc(j);
- end;
- end;
- procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
- var
- Attr: TSAXAttributes;
- TagName: SAXString;
- Ent: SAXChar;
- begin
- FTokenText := FRawTokenText; // this is where conversion takes place
- case ScannerContext of
- scWhitespace:
- DoIgnorableWhitespace(PSAXChar(TokenText), 0, Length(TokenText));
- scText:
- DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
- scEntityReference:
- begin
- if (Length(TokenText) >= 2) and (TokenText[1] = '#') and
- (((TokenText[2] >= '0') and (TokenText[2] <= '9')) or (TokenText[2]='x')) and
- // here actually using it to resolve character references
- ResolveHTMLEntityReference(TokenText, Ent) then
- DoCharacters(@Ent, 0, 1)
- else if TokenText = 'lt' then
- DoCharacters(@char_lt, 0, 1)
- else if TokenText = 'gt' then
- DoCharacters(@char_gt, 0, 1)
- else if TokenText = 'amp' then
- DoCharacters(@char_amp, 0, 1)
- else if TokenText = 'quot' then
- DoCharacters(@char_quot, 0, 1)
- else if TokenText = 'apos' then
- DoCharacters(@char_apos, 0, 1)
- else
- DoSkippedEntity(TokenText);
- end;
- scTag:
- if Length(TokenText) > 0 then
- begin
- Attr := nil;
- if TokenText[Length(fTokenText)]='/' then // handle empty tag
- begin
- setlength(fTokenText,length(fTokenText)-1);
- // Do NOT combine to a single line, as Attr is an output value!
- TagName := SplitTagString(TokenText, Attr);
- DoStartElement('', TagName, '', Attr);
- DoEndElement('', TagName, '');
- end
- else if TokenText[1] = '/' then
- begin
- DoEndElement('',
- SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
- end
- else if TokenText[1] <> '!' then
- begin
- // Do NOT combine to a single line, as Attr is an output value!
- TagName := SplitTagString(TokenText, Attr);
- DoStartElement('', TagName, '', Attr);
- end;
- if Assigned(Attr) then
- Attr.Free;
- end;
- end;
- FScannerContext := NewContext;
- FTokenText := '';
- FRawTokenText := '';
- FCurStringValueDelimiter := #0;
- FAttrNameRead := False;
- end;
- { TXMLToDOMConverter }
- constructor TXMLToDOMConverter.Create(AReader: TSAXXMLReader;
- ADocument: TDOMDocument);
- begin
- inherited Create;
- FReader := AReader;
- FReader.OnCharacters := @ReaderCharacters;
- FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
- FReader.OnSkippedEntity := @ReaderSkippedEntity;
- FReader.OnStartElement := @ReaderStartElement;
- FReader.OnEndElement := @ReaderEndElement;
- FDocument := ADocument;
- FElementStack := TList.Create;
- FNodeBuffer := TList.Create;
- end;
- constructor TXMLToDOMConverter.CreateFragment(AReader: TSAXXMLReader;
- AFragmentRoot: TDOMNode);
- begin
- Create(AReader, AFragmentRoot.OwnerDocument);
- FragmentRoot := AFragmentRoot;
- IsFragmentMode := True;
- end;
- destructor TXMLToDOMConverter.Destroy;
- var
- i: Integer;
- begin
- // Theoretically, always exactly one item will remain - the root element:
- for i := 0 to FNodeBuffer.Count - 1 do
- TXMLNodeInfo(FNodeBuffer[i]).Free;
- FNodeBuffer.Free;
- FElementStack.Free;
- inherited Destroy;
- end;
- procedure TXMLToDOMConverter.ReaderCharacters(Sender: TObject;
- const ch: PSAXChar; Start, Count: Integer);
- var
- NodeInfo: TXMLNodeInfo;
- begin
- NodeInfo := TXMLNodeInfo.Create;
- NodeInfo.NodeType := ntText;
- NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
- FNodeBuffer.Add(NodeInfo);
- end;
- procedure TXMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
- const ch: PSAXChar; Start, Count: Integer);
- var
- NodeInfo: TXMLNodeInfo;
- begin
- NodeInfo := TXMLNodeInfo.Create;
- NodeInfo.NodeType := ntWhitespace;
- NodeInfo.DOMNode := FDocument.CreateTextNodeBuf(ch, Count, False);
- FNodeBuffer.Add(NodeInfo);
- end;
- procedure TXMLToDOMConverter.ReaderSkippedEntity(Sender: TObject;
- const Name: SAXString);
- var
- NodeInfo: TXMLNodeInfo;
- begin
- NodeInfo := TXMLNodeInfo.Create;
- NodeInfo.NodeType := ntEntityReference;
- NodeInfo.DOMNode := FDocument.CreateEntityReference(Name);
- FNodeBuffer.Add(NodeInfo);
- end;
- procedure TXMLToDOMConverter.ReaderStartElement(Sender: TObject;
- const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
- var
- NodeInfo: TXMLNodeInfo;
- Element: TDOMElement;
- i: Integer;
- begin
- // WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
- Element := FDocument.CreateElement(LocalName);
- if Assigned(Attr) then
- begin
- // WriteLn('Attribute: ', Attr.GetLength);
- for i := 0 to Attr.GetLength - 1 do
- begin
- // WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
- Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
- end;
- end;
- NodeInfo := TXMLNodeInfo.Create;
- NodeInfo.NodeType := ntTag;
- NodeInfo.DOMNode := Element;
- if IsFragmentMode then
- begin
- if not FragmentRootSet then
- begin
- FragmentRoot.AppendChild(Element);
- FragmentRootSet := True;
- end;
- end else
- if not Assigned(FDocument.DocumentElement) then
- FDocument.AppendChild(Element);
- FNodeBuffer.Add(NodeInfo);
- // WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
- end;
- procedure TXMLToDOMConverter.ReaderEndElement(Sender: TObject;
- const NamespaceURI, LocalName, RawName: SAXString);
- var
- NodeInfo, NodeInfo2: TXMLNodeInfo;
- i : Integer;
- begin
- // WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
- // Find the matching start tag
- i := FNodeBuffer.Count - 1;
- while i >= 0 do
- begin
- NodeInfo := TXMLNodeInfo(FNodeBuffer.Items[i]);
- if (NodeInfo.NodeType = ntTag) and
- (CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
- begin
- // We found the matching start tag
- Inc(i);
- while i < FNodeBuffer.Count do
- begin
- NodeInfo2 := TXMLNodeInfo(FNodeBuffer.Items[i]);
- NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
- NodeInfo2.Free;
- FNodeBuffer.Delete(i);
- end;
- break;
- end;
- Dec(i);
- end;
- end;
- procedure ReadXMLFile(var ADoc: TXMLDocument; const AFilename: String);
- var
- f: TStream;
- begin
- ADoc := nil;
- f := TFileStream.Create(AFilename, fmOpenRead);
- try
- ReadXMLFile(ADoc, f);
- finally
- f.Free;
- end;
- end;
- procedure ReadXMLFile(var ADoc: TXMLDocument; var f: TStream);
- var
- Reader: TSAXXMLReader;
- Converter: TXMLToDOMConverter;
- begin
- ADoc := TXMLDocument.Create;
- Reader := TSAXXMLReader.Create;
- try
- Converter := TXMLToDOMConverter.Create(Reader, ADoc);
- try
- Reader.ParseStream(f);
- finally
- Converter.Free;
- end;
- finally
- Reader.Free;
- end;
- end;
- procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
- var
- f: TStream;
- begin
- f := TFileStream.Create(AFilename, fmOpenRead);
- try
- ReadXMLFragment(AParentNode, f);
- finally
- f.Free;
- end;
- end;
- procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
- var
- Reader: TSAXXMLReader;
- Converter: TXMLToDOMConverter;
- begin
- Reader := TSAXXMLReader.Create;
- try
- Converter := TXMLToDOMConverter.CreateFragment(Reader, AParentNode);
- try
- Reader.ParseStream(f);
- finally
- Converter.Free;
- end;
- finally
- Reader.Free;
- end;
- end;
- end.
|