Browse Source

* Initial implementation

git-svn-id: trunk@5596 -
michael 18 years ago
parent
commit
e0935d347a
2 changed files with 613 additions and 0 deletions
  1. 1 0
      .gitattributes
  2. 612 0
      fcl/xml/sax_xml.pp

+ 1 - 0
.gitattributes

@@ -1177,6 +1177,7 @@ fcl/xml/htmwrite.pp svneol=native#text/plain
 fcl/xml/names.inc svneol=native#text/plain
 fcl/xml/sax.pp svneol=native#text/plain
 fcl/xml/sax_html.pp svneol=native#text/plain
+fcl/xml/sax_xml.pp svneol=native#text/plain
 fcl/xml/xhtml.pp svneol=native#text/plain
 fcl/xml/xmlcfg.pp svneol=native#text/plain
 fcl/xml/xmlread.pp svneol=native#text/plain

+ 612 - 0
fcl/xml/sax_xml.pp

@@ -0,0 +1,612 @@
+{
+    This file is part of the Free Component Library
+    Copyright (c) 2006 by Michael Van Canneyt.
+    Based on SAX_HTML implementation from Sebastian Guenther.
+
+    XML parser with SAX interface
+
+    See the file COPYING.FPC, included in this distribution,
+    for details about the copyright.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ **********************************************************************}
+
+unit SAX_XML;
+
+interface
+
+uses SysUtils, Classes, SAX, DOM;
+
+type
+
+{ TXMLReader: The XML reader class }
+
+  TXMLScannerContext = (
+    scUnknown,
+    scWhitespace,       // within whitespace
+    scText,             // within text
+    scEntityReference,  // within entity reference ("&...;")
+    scTag);             // within a start tag or end tag
+
+  TSAXXMLReader = class(TSAXReader)
+  private
+    FStarted: Boolean;
+    FEndOfStream: Boolean;
+    FScannerContext: TXMLScannerContext;
+    FTokenText: SAXString;
+    FCurStringValueDelimiter: Char;
+    FAttrNameRead: Boolean;
+  protected
+    procedure EnterNewScannerContext(NewContext: TXMLScannerContext);
+  public
+    constructor Create;
+    destructor Destroy; override;
+
+    procedure Parse(AInput: TSAXInputSource); override; overload;
+
+    property EndOfStream: Boolean read FEndOfStream;
+    property ScannerContext: TXMLScannerContext read FScannerContext;
+    property TokenText: SAXString read FTokenText;
+  end;
+
+
+{ TXMLToDOMConverter }
+
+  TXMLNodeType = (ntWhitespace, ntText, ntEntityReference, ntTag);
+
+  TXMLNodeInfo = class
+    NodeType: TXMLNodeType;
+    DOMNode: TDOMNode;
+  end;
+
+  TXMLToDOMConverter = class
+  private
+    FReader: TSAXXMLReader;
+    FDocument: TDOMDocument;
+    FElementStack: TList;
+    FNodeBuffer: TList;
+    IsFragmentMode, FragmentRootSet: Boolean;
+    FragmentRoot: TDOMNode;
+
+    procedure ReaderCharacters(Sender: TObject; const ch: PSAXChar;
+      Start, Count: Integer);
+    procedure ReaderIgnorableWhitespace(Sender: TObject; const ch: PSAXChar;
+      Start, Count: Integer);
+    procedure ReaderSkippedEntity(Sender: TObject; const Name: SAXString);
+    procedure ReaderStartElement(Sender: TObject;
+      const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
+    procedure ReaderEndElement(Sender: TObject;
+      const NamespaceURI, LocalName, RawName: SAXString);
+
+  public
+    constructor Create(AReader: TSAXXMLReader; ADocument: TDOMDocument);
+    constructor CreateFragment(AReader: TSAXXMLReader; AFragmentRoot: TDOMNode);
+    destructor Destroy; override;
+  end;
+
+
+// Helper functions; these ones are XML equivalents of ReadXML[File|Fragment]
+
+procedure ReadXMLFile(var ADoc: TXMLDocument; const AFilename: String);
+procedure ReadXMLFile(var ADoc: TXMLDocument; var f: TStream);
+
+procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
+procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
+
+
+
+implementation
+
+uses htmldefs; // for entities...
+
+const
+  WhitespaceChars = [#9, #10, #13, ' '];
+
+
+constructor TSAXXMLReader.Create;
+begin
+  inherited Create;
+  FScannerContext := scUnknown;
+end;
+
+destructor TSAXXMLReader.Destroy;
+begin
+  if FStarted then
+    DoEndDocument;
+  inherited Destroy;
+end;
+
+procedure TSAXXMLReader.Parse(AInput: TSAXInputSource);
+const
+  MaxBufferSize = 1024;
+var
+  Buffer: array[0..MaxBufferSize - 1] of Char;
+  BufferSize, BufferPos: Integer;
+begin
+  if not FStarted then
+  begin
+    FStarted := True;
+    DoStartDocument;
+  end;
+
+  FEndOfStream := False;
+  while True do
+  begin
+    // Read data into the input buffer
+    BufferSize := AInput.Stream.Read(Buffer, MaxBufferSize);
+    if BufferSize = 0 then
+    begin
+      FEndOfStream := True;
+      break;
+    end;
+
+    BufferPos := 0;
+    while BufferPos < BufferSize do
+      case ScannerContext of
+        scUnknown:
+          case Buffer[BufferPos] of
+            #9, #10, #13, ' ':
+              EnterNewScannerContext(scWhitespace);
+            '&':
+              begin
+                Inc(BufferPos);
+                EnterNewScannerContext(scEntityReference);
+              end;
+            '<':
+              begin
+                Inc(BufferPos);
+                EnterNewScannerContext(scTag);
+              end;
+            else
+              EnterNewScannerContext(scText);
+          end;
+        scWhitespace:
+          case Buffer[BufferPos] of
+            #9, #10, #13, ' ':
+              begin
+                FTokenText := FTokenText + Buffer[BufferPos];
+                Inc(BufferPos);
+              end;
+            '&':
+              begin
+                Inc(BufferPos);
+                EnterNewScannerContext(scEntityReference);
+              end;
+            '<':
+              begin
+                Inc(BufferPos);
+                EnterNewScannerContext(scTag);
+              end;
+            else
+              EnterNewScannerContext(scText);
+          end;
+        scText:
+          case Buffer[BufferPos] of
+            #9, #10, #13, ' ':
+              EnterNewScannerContext(scWhitespace);
+            '&':
+              begin
+                Inc(BufferPos);
+                EnterNewScannerContext(scEntityReference);
+              end;
+            '<':
+              begin
+                Inc(BufferPos);
+                EnterNewScannerContext(scTag);
+              end;
+            else
+            begin
+              FTokenText := FTokenText + Buffer[BufferPos];
+              Inc(BufferPos);
+            end;
+          end;
+        scEntityReference:
+          if Buffer[BufferPos] = ';' then
+          begin
+            Inc(BufferPos);
+            EnterNewScannerContext(scUnknown);
+          end else if not (Buffer[BufferPos] in
+            ['a'..'z', 'A'..'Z', '0'..'9', '#']) then
+            EnterNewScannerContext(scUnknown)
+          else
+          begin
+            FTokenText := FTokenText + Buffer[BufferPos];
+            Inc(BufferPos);
+          end;
+        scTag:
+          case Buffer[BufferPos] of
+            '''', '"':
+              begin
+                if FAttrNameRead then
+                begin
+                  if FCurStringValueDelimiter = #0 then
+                    FCurStringValueDelimiter := Buffer[BufferPos]
+                  else if FCurStringValueDelimiter = Buffer[BufferPos] then
+                  begin
+                    FCurStringValueDelimiter := #0;
+                    FAttrNameRead := False;
+                  end;
+                end;
+                FTokenText := FTokenText + Buffer[BufferPos];
+                Inc(BufferPos);
+              end;
+            '=':
+              begin
+                FAttrNameRead := True;
+                FTokenText := FTokenText + Buffer[BufferPos];
+                Inc(BufferPos);
+              end;
+            '>':
+              begin
+                Inc(BufferPos);
+                if FCurStringValueDelimiter = #0 then
+                  EnterNewScannerContext(scUnknown);
+              end;
+            else
+            begin
+              FTokenText := FTokenText + Buffer[BufferPos];
+              Inc(BufferPos);
+            end;
+          end;
+      end;
+  end;
+end;
+
+procedure TSAXXMLReader.EnterNewScannerContext(NewContext: TXMLScannerContext);
+
+  function SplitTagString(const s: String; var Attr: TSAXAttributes): String;
+  var
+    i, j: Integer;
+    AttrName: String;
+    ValueDelimiter: Char;
+    DoIncJ: Boolean;
+  begin
+    Attr := nil;
+    i := Pos(' ', s);
+    if i <= 0 then
+      Result := LowerCase(s)
+    else
+    begin
+      Result := LowerCase(Copy(s, 1, i - 1));
+      Attr := TSAXAttributes.Create;
+
+      Inc(i);
+
+      while (i <= Length(s)) and (s[i] in WhitespaceChars) do
+        Inc(i);
+
+      SetLength(AttrName, 0);
+      j := i;
+
+      while j <= Length(s) do
+        if s[j] = '=' then
+        begin
+          AttrName := LowerCase(Copy(s, i, j - i));
+          Inc(j);
+          if (j < Length(s)) and ((s[j] = '''') or (s[j] = '"')) then
+          begin
+            ValueDelimiter := s[j];
+            Inc(j);
+          end else
+            ValueDelimiter := #0;
+          i := j;
+          DoIncJ := False;
+          while j <= Length(s) do
+            if ValueDelimiter = #0 then
+              if s[j] in WhitespaceChars then
+                break
+              else
+                Inc(j)
+            else if s[j] = ValueDelimiter then
+            begin
+              DoIncJ := True;
+              break
+            end else
+              Inc(j);
+
+          Attr.AddAttribute('', AttrName, '', '', Copy(s, i, j - i));
+
+          if DoIncJ then
+            Inc(j);
+
+          while (j <= Length(s)) and (s[j] in WhitespaceChars) do
+            Inc(j);
+          i := j;
+        end
+        else if s[j] in WhitespaceChars then
+        begin
+          Attr.AddAttribute('', Copy(s, i, j - i), '', '', '');
+          Inc(j);
+          while (j <= Length(s)) and (s[j] in WhitespaceChars) do
+            Inc(j);
+          i := j;
+        end else
+          Inc(j);
+    end;
+  end;
+
+var
+  Attr: TSAXAttributes;
+  EntString, TagName: String;
+  Found: Boolean;
+  Ent: Char;
+  i: Integer;
+begin
+  case ScannerContext of
+    scWhitespace:
+      DoIgnorableWhitespace(PSAXChar(TokenText), 1, Length(TokenText));
+    scText:
+      DoCharacters(PSAXChar(TokenText), 0, Length(TokenText));
+    scEntityReference:
+      begin
+        if ResolveHTMLEntityReference(TokenText, Ent) then
+        begin
+          EntString := Ent;
+          DoCharacters(PSAXChar(EntString), 0, 1);
+        end else
+        begin
+          { Is this a predefined Unicode character entity? We must check this,
+            as undefined entities must be handled as text, for compatiblity
+            to popular browsers... }
+          Found := False;
+          for i := Low(UnicodeHTMLEntities) to High(UnicodeHTMLEntities) do
+            if UnicodeHTMLEntities[i] = TokenText then
+            begin
+              Found := True;
+              break;
+            end;
+          if Found then
+            DoSkippedEntity(TokenText)
+          else
+            DoCharacters(PSAXChar('&' + TokenText), 0, Length(TokenText) + 1);
+        end;
+      end;
+    scTag:
+      if Length(TokenText) > 0 then
+      begin
+        Attr := nil;
+        if TokenText[1] = '/' then
+        begin
+          DoEndElement('',
+            SplitTagString(Copy(TokenText, 2, Length(TokenText)), Attr), '');
+        end else if TokenText[1] <> '!' then
+        begin
+          // Do NOT combine to a single line, as Attr is an output value!
+          TagName := SplitTagString(TokenText, Attr);
+          DoStartElement('', TagName, '', Attr);
+        end;
+        if Assigned(Attr) then
+          Attr.Free;
+      end;
+  end;
+  FScannerContext := NewContext;
+  SetLength(FTokenText, 0);
+  FCurStringValueDelimiter := #0;
+  FAttrNameRead := False;
+end;
+
+
+{ TXMLToDOMConverter }
+
+constructor TXMLToDOMConverter.Create(AReader: TSAXXMLReader;
+  ADocument: TDOMDocument);
+begin
+  inherited Create;
+  FReader := AReader;
+  FReader.OnCharacters := @ReaderCharacters;
+  FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
+  FReader.OnSkippedEntity := @ReaderSkippedEntity;
+  FReader.OnStartElement := @ReaderStartElement;
+  FReader.OnEndElement := @ReaderEndElement;
+  FDocument := ADocument;
+  FElementStack := TList.Create;
+  FNodeBuffer := TList.Create;
+end;
+
+constructor TXMLToDOMConverter.CreateFragment(AReader: TSAXXMLReader;
+  AFragmentRoot: TDOMNode);
+begin
+  inherited Create;
+  FReader := AReader;
+  FReader.OnCharacters := @ReaderCharacters;
+  FReader.OnIgnorableWhitespace := @ReaderIgnorableWhitespace;
+  FReader.OnSkippedEntity := @ReaderSkippedEntity;
+  FReader.OnStartElement := @ReaderStartElement;
+  FReader.OnEndElement := @ReaderEndElement;
+  FDocument := AFragmentRoot.OwnerDocument;
+  FElementStack := TList.Create;
+  FNodeBuffer := TList.Create;
+  FragmentRoot := AFragmentRoot;
+  IsFragmentMode := True;
+end;
+
+destructor TXMLToDOMConverter.Destroy;
+var
+  i: Integer;
+begin
+  // Theoretically, always exactly one item will remain - the root element:
+  for i := 0 to FNodeBuffer.Count - 1 do
+    TXMLNodeInfo(FNodeBuffer[i]).Free;
+  FNodeBuffer.Free;
+
+  FElementStack.Free;
+  inherited Destroy;
+end;
+
+procedure TXMLToDOMConverter.ReaderCharacters(Sender: TObject;
+  const ch: PSAXChar; Start, Count: Integer);
+var
+  s: SAXString;
+  NodeInfo: TXMLNodeInfo;
+begin
+  SetLength(s, Count);
+  Move(ch^, s[1], Count * SizeOf(SAXChar));
+
+  NodeInfo := TXMLNodeInfo.Create;
+  NodeInfo.NodeType := ntText;
+  NodeInfo.DOMNode := FDocument.CreateTextNode(s);
+  FNodeBuffer.Add(NodeInfo);
+end;
+
+procedure TXMLToDOMConverter.ReaderIgnorableWhitespace(Sender: TObject;
+  const ch: PSAXChar; Start, Count: Integer);
+var
+  s: SAXString;
+  NodeInfo: TXMLNodeInfo;
+begin
+  SetLength(s, Count);
+  Move(ch^, s[1], Count * SizeOf(SAXChar));
+
+  NodeInfo := TXMLNodeInfo.Create;
+  NodeInfo.NodeType := ntWhitespace;
+  NodeInfo.DOMNode := FDocument.CreateTextNode(s);
+  FNodeBuffer.Add(NodeInfo);
+end;
+
+procedure TXMLToDOMConverter.ReaderSkippedEntity(Sender: TObject;
+  const Name: SAXString);
+var
+  NodeInfo: TXMLNodeInfo;
+begin
+  NodeInfo := TXMLNodeInfo.Create;
+  NodeInfo.NodeType := ntEntityReference;
+  NodeInfo.DOMNode := FDocument.CreateEntityReference(Name);
+  FNodeBuffer.Add(NodeInfo);
+end;
+
+procedure TXMLToDOMConverter.ReaderStartElement(Sender: TObject;
+  const NamespaceURI, LocalName, RawName: SAXString; Attr: TSAXAttributes);
+var
+  NodeInfo: TXMLNodeInfo;
+  Element: TDOMElement;
+  i: Integer;
+begin
+  // WriteLn('Start: ', LocalName, '. Node buffer before: ', FNodeBuffer.Count, ' elements');
+  Element := FDocument.CreateElement(LocalName);
+  if Assigned(Attr) then
+  begin
+    // WriteLn('Attribute: ', Attr.GetLength);
+    for i := 0 to Attr.GetLength - 1 do
+    begin
+      // WriteLn('#', i, ': LocalName = ', Attr.GetLocalName(i), ', Value = ', Attr.GetValue(i));
+      Element[Attr.GetLocalName(i)] := Attr.GetValue(i);
+    end;
+  end;
+
+  NodeInfo := TXMLNodeInfo.Create;
+  NodeInfo.NodeType := ntTag;
+  NodeInfo.DOMNode := Element;
+  if IsFragmentMode then
+  begin
+    if not FragmentRootSet then
+    begin
+      FragmentRoot.AppendChild(Element);
+      FragmentRootSet := True;
+    end;
+  end else
+    if not Assigned(FDocument.DocumentElement) then
+      FDocument.AppendChild(Element);
+  FNodeBuffer.Add(NodeInfo);
+  // WriteLn('Start: ', LocalName, '. Node buffer after: ', FNodeBuffer.Count, ' elements');
+end;
+
+procedure TXMLToDOMConverter.ReaderEndElement(Sender: TObject;
+  const NamespaceURI, LocalName, RawName: SAXString);
+var
+  NodeInfo, NodeInfo2: TXMLNodeInfo;
+  i : Integer;
+
+begin
+  // WriteLn('End: ', LocalName, '. Node buffer: ', FNodeBuffer.Count, ' elements');
+  // Find the matching start tag
+  i := FNodeBuffer.Count - 1;
+  while i >= 0 do
+  begin
+    NodeInfo := TXMLNodeInfo(FNodeBuffer.Items[i]);
+    if (NodeInfo.NodeType = ntTag) and
+      (CompareText(NodeInfo.DOMNode.NodeName, LocalName) = 0) then
+    begin
+      // We found the matching start tag
+
+      Inc(i);
+      while i < FNodeBuffer.Count do
+      begin
+        NodeInfo2 := TXMLNodeInfo(FNodeBuffer.Items[i]);
+        NodeInfo.DOMNode.AppendChild(NodeInfo2.DOMNode);
+        NodeInfo2.Free;
+        FNodeBuffer.Delete(i);
+      end;
+      break;
+    end;
+    Dec(i);
+  end;
+end;
+
+
+procedure ReadXMLFile(var ADoc: TXMLDocument; const AFilename: String);
+var
+  f: TStream;
+begin
+  ADoc := nil;
+  f := TFileStream.Create(AFilename, fmOpenRead);
+  try
+    ReadXMLFile(ADoc, f);
+  finally
+    f.Free;
+  end;
+end;
+
+procedure ReadXMLFile(var ADoc: TXMLDocument; var f: TStream);
+var
+  Reader: TSAXXMLReader;
+  Converter: TXMLToDOMConverter;
+begin
+  ADoc := TXMLDocument.Create;
+  Reader := TSAXXMLReader.Create;
+  try
+    Converter := TXMLToDOMConverter.Create(Reader, ADoc);
+    try
+      Reader.ParseStream(f);
+    finally
+      Converter.Free;
+    end;
+  finally
+    Reader.Free;
+  end;
+end;
+
+procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String);
+var
+  f: TStream;
+begin
+  f := TFileStream.Create(AFilename, fmOpenRead);
+  try
+    ReadXMLFragment(AParentNode, f);
+  finally
+    f.Free;
+  end;
+end;
+
+procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream);
+var
+  Reader: TSAXXMLReader;
+  Converter: TXMLToDOMConverter;
+begin
+  Reader := TSAXXMLReader.Create;
+  try
+    Converter := TXMLToDOMConverter.CreateFragment(Reader, AParentNode);
+    try
+      Reader.ParseStream(f);
+    finally
+      Converter.Free;
+    end;
+  finally
+    Reader.Free;
+  end;
+end;
+
+
+end.