{ This file is part of the Free Component Library XML reading routines. Copyright (c) 1999-2000 by Sebastian Guenther, sg@freepascal.org Modified in 2006 by Sergei Gorelkin, sergei_gorelkin@mail.ru See the file COPYING.FPC, included in this distribution, for details about the copyright. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} unit XMLRead; {$ifdef fpc} {$MODE objfpc}{$H+} {$endif} interface {off $DEFINE MEM_CHECK} uses {$IFDEF MEM_CHECK}MemCheck,{$ENDIF} SysUtils, Classes, DOM; type EXMLReadError = class(Exception); procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String); overload; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: File); overload; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream); overload; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); overload; procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String); overload; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: File); overload; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream); overload; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const AFilename: String); overload; procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String); overload; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: File); overload; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream); overload; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); overload; // ======================================================= implementation type TSetOfChar = set of Char; const Letter = ['A'..'Z', 'a'..'z']; Digit = ['0'..'9']; PubidChars: TSetOfChar = [' ', #13, #10, 'a'..'z', 'A'..'Z', '0'..'9', '-', '''', '(', ')', '+', ',', '.', '/', ':', '=', '?', ';', '!', '*', '#', '@', '$', '_', '%']; WhitespaceChars: TSetOfChar = [#9, #10, #13, ' ']; NmToken: TSetOfChar = Letter + Digit + ['.', '-', '_', ':']; type TXMLReaderDocument = class(TXMLDocument) public procedure SetDocType(ADocType: TDOMDocumentType); end; TXMLReaderDocumentType = class(TDOMDocumentType) public constructor Create(ADocument: TXMLReaderDocument); property Name: DOMString read FNodeName write FNodeName; end; { supported encodings } TEncoding = (enUnknown, enUTF8, enUTF16BE, enUTF16LE); TXMLReader = class private FCurChar: WideChar; FLine: Integer; FColumn: Integer; FSeenCR: Boolean; FEncoding: TEncoding; FValue: array of WideChar; FValueLength: Integer; FPrologParsed: Boolean; procedure RaiseExpectedQmark; procedure InternalGetChar; function GetChar: WideChar; procedure AppendValue(wc: WideChar); procedure DetectEncoding; protected buf: PChar; Filename: String; procedure RaiseExc(const descr: String); overload; procedure RaiseExc(Expected, Found: WideChar); overload; function SkipWhitespace: Boolean; procedure ExpectWhitespace; procedure ExpectString(const s: String); procedure ExpectChar(wc: WideChar); function CheckForChar(c: WideChar): Boolean; procedure SkipString(const ValidChars: TSetOfChar); function GetString(const ValidChars: TSetOfChar): WideString; procedure RaiseNameNotFound; function CheckName: Boolean; function ExpectName: WideString; // [5] procedure SkipName; procedure ExpectAttValue(attr: TDOMAttr); // [10] procedure SkipPubidLiteral; // [12] procedure ParseComment(AOwner: TDOMNode); // [15] procedure ParsePI; // [16] procedure ExpectProlog; // [22] procedure ParseProlog; function ParseEq: Boolean; // [25] procedure ExpectEq; procedure ParseMisc(AOwner: TDOMNode); // [27] function ParseMarkupDecl: Boolean; // [29] procedure ParseCharData(AOwner: TDOMNode); // [14] procedure ParseCDSect(AOwner: TDOMNode); // [18] function ParseElementContent(AOwner: TDOMNode): Boolean; procedure ParseElement(AOwner: TDOMNode); // [39] procedure ExpectElement(AOwner: TDOMNode); procedure ParseReference(AOwner: TDOMNode); // [67] function ParsePEReference: Boolean; // [69] function ParseExternalID: Boolean; // [75] procedure ExpectExternalID; procedure SkipEncodingDecl; // [80] procedure ParseEntityDecl; procedure ParseAttlistDecl; procedure ParseElementDecl; procedure ParseNotationDecl; procedure ResolveEntities(RootNode: TDOMNode); public doc: TDOMDocument; procedure ProcessXML(ABuf: PChar; const AFilename: String); // [1] procedure ProcessFragment(AOwner: TDOMNode; ABuf: PChar; const AFilename: String); procedure ProcessDTD(ABuf: PChar; const AFilename: String); // ([29]) end; { TXMLReaderDocument } procedure TXMLReaderDocument.SetDocType(ADocType: TDOMDocumentType); begin FDocType := ADocType; end; constructor TXMLReaderDocumentType.Create(ADocument: TXMLReaderDocument); begin inherited Create(ADocument); end; // TODO: this and others must use table approach for speed-up function IsNameStartChar(wc: WideChar): Boolean; // [4] begin case wc of // (note) excludes single $D7, $F7, $37E, ':', 'A'..'Z', '_', 'a'..'z', #$C0..#$D6, #$D8..#$F6, #$F8..#$2FF, #$370..#$37D, #$37F..#$1FFF, #$200C, #$200D, #$2070..#$218F, #$2C00..#$2FEF, #$3001..#$D7FF, #$F900..#$FDCF, #$FDF0..#$FFFD: Result := True; else Result := False; end; end; function IsNameChar(wc: WideChar): Boolean; // [4a] begin Result := IsNameStartChar(wc) or ((wc = '-') or (wc = '.') or ((wc >= '0') and (wc <= '9')) or (wc = #$B7) or ((wc >= #$300) and (wc <= #$36F)) or (wc = #$203F) or (wc = #$2040)); end; function IsWhitespace(wc: WideChar): Boolean; begin Result := (wc = ' ') or (wc = #10) or (wc = #13) or (wc = #9); end; { TXMLReader } procedure TXMLReader.DetectEncoding; var w: Word; Enc: TEncoding; function CheckByte(value: Byte): Boolean; var cb: Byte; begin cb := ord(buf[0]); Inc(buf); Result := (cb = value); end; function CheckWord(value: Word): Boolean; var cw: Word; begin cw := PWord(buf)^; Inc(buf, sizeof(Word)); {$IFDEF ENDIAN_BIG} Swap(cw); {$ENDIF} // TODO: Is that correct? Result := (cw = value); end; begin Enc := enUnknown; w := PWord(Buf)^; Inc(Buf, sizeof(Word)); {$IFDEF ENDIAN_BIG} Swap(w); {$ENDIF} // TODO: Is that correct? // case of no BOM if (w = (ord('?') shl 8 + ord('<'))) { $3F3C } then Enc := enUTF8 // not known, in fact, just a default else if (w = ord('<')) and CheckWord(ord('?')) then Enc := enUTF16LE else if (w = ord('<') shl 8) and CheckWord(ord('?') shl 8) then Enc := enUTF16BE; if Enc <> enUnknown then // any of above tests succeeded, must start from '?' begin FEncoding := Enc; FCurChar := '?'; Inc(FColumn); Exit; end; if w = $FFFE then FEncoding := enUTF16BE else if w = $FEFF then FEncoding := enUTF16LE else if (w = $BBEF) and CheckByte($BF) then FEncoding := enUTF8; GetChar; end; procedure TXMLReader.InternalGetChar; var ch, ch2, ch3: Byte; procedure BadChar; begin RaiseExc('Invalid character in UTF8 sequence'); end; begin if FEncoding in [enUnknown, enUTF8] then begin ch := ord(buf[0]); Inc(Buf); end else begin // Endianness: no swapping here; see below FCurChar := PWideChar(Buf)^; Inc(Buf, sizeof(WideChar)); end; case FEncoding of enUnknown: FCurChar := WideChar(Ch); enUTF8: if Ch < 128 then { ASCII } FCurChar := WideChar(Ch) else if (Ch and $E0) = $C0 then { #$0080 - #$07FF } begin ch2 := ord(buf[0]); Inc(Buf); if (Ch2 and $C0) <> $80 then BadChar; FCurChar := WideChar((Ch and $1F) shl 6 + (Ch2 and $3F)); end else if (Ch and $F0) = $E0 then { #$0800 - #$FFFF } begin ch2 := ord(buf[0]); Inc(buf); if (Ch2 and $C0) <> $80 then BadChar; ch3 := ord(buf[0]); Inc(buf); if (Ch3 and $C0) <> $80 then BadChar; FCurChar := WideChar(Word((Ch and $0F) shl 12) + (Ch2 and $3F) shl 6 + (Ch3 and $3F)); end else RaiseExc('Unsupported UTF8 character'); {$IFDEF ENDIAN_BIG} enUTF16LE: {$ELSE} enUTF16BE: {$ENDIF} FCurChar := WideChar((Ord(FCurChar) and $FF) shl 8 + (Ord(FCurChar) shr 8)); end; end; function TXMLReader.GetChar: WideChar; begin InternalGetChar; if FSeenCR then begin case FCurChar of #10, #$85: InternalGetChar; // #$85 is xml 1.1 specific end; FSeenCR := False; end; if FCurChar = #13 then begin FSeenCR := True; FCurChar := #10; end else // xml 1.1 specific check if (FCurChar = #$85) or (FCurChar = #$2028) then FCurChar := #10; if FCurChar = #10 then begin Inc(FLine); FColumn := 0; end else Inc(FColumn); Result := FCurChar; end; procedure TXMLReader.AppendValue(wc: WideChar); var Alloc: Integer; begin Alloc := Length(FValue); if FValueLength >= Alloc then begin if Alloc = 0 then Alloc := 512 else Alloc := Alloc * 2; SetLength(FValue, Alloc); end; FValue[FValueLength] := wc; Inc(FValueLength); end; procedure TXMLReader.RaiseExpectedQmark; begin RaiseExc('Expected single or double quotation mark'); end; procedure TXMLReader.RaiseExc(Expected, Found: WideChar); begin RaiseExc('Expected "' + Expected + '", but found "' + Found + '",'); end; procedure TXMLReader.RaiseExc(const descr: String); begin raise EXMLReadError.CreateFmt('In %s (line %d pos %d): %s', [Filename, FLine, FColumn, descr]); end; function TXMLReader.SkipWhitespace: Boolean; begin Result := False; while IsWhitespace(FCurChar) do begin GetChar; Result := True; end; end; procedure TXMLReader.ExpectWhitespace; begin if not SkipWhitespace then RaiseExc('Expected whitespace'); end; procedure TXMLReader.ExpectChar(wc: WideChar); begin if not CheckForChar(wc) then RaiseExc(wc, FCurChar); end; procedure TXMLReader.ExpectString(const s: String); procedure RaiseStringNotFound; begin RaiseExc('Expected "' + s + '"'); end; var I: Integer; begin for I := 1 to Length(s) do begin if FCurChar <> WideChar(s[i]) then RaiseStringNotFound; GetChar; end; end; function TXMLReader.CheckForChar(c: WideChar): Boolean; begin Result := (FCurChar = c); if Result then GetChar; end; procedure TXMLReader.SkipString(const ValidChars: TSetOfChar); begin FValueLength := 0; while (ord(FCurChar) < 256) and (char(FCurChar) in ValidChars) do begin AppendValue(FCurChar); GetChar; end; end; function TXMLReader.GetString(const ValidChars: TSetOfChar): WideString; begin SkipString(ValidChars); SetString(Result, PWideChar(@FValue[0]), FValueLength); end; procedure TXMLReader.ProcessXML(ABuf: PChar; const AFilename: String); // [1] begin buf := ABuf; Filename := AFilename; FLine := 1; FColumn := 0; doc := TXMLReaderDocument.Create; DetectEncoding; ExpectProlog; {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt('TXMLReader.ProcessXML A');{$ENDIF} ExpectElement(doc); {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt('TXMLReader.ProcessXML B');{$ENDIF} ParseMisc(doc); if FCurChar <> #0 then RaiseExc('Text after end of document element found'); end; procedure TXMLReader.ProcessFragment(AOwner: TDOMNode; ABuf: PChar; const AFilename: String); begin buf := ABuf; Filename := AFilename; FLine := 1; FColumn := 0; FEncoding := enUTF8; // TODO: Detect it? Not sure for now... GetChar; if not ParseElementContent(AOwner) then ; end; function TXMLReader.CheckName: Boolean; // [5] begin Result := IsNameStartChar(FCurChar); if Result then begin FValueLength := 0; repeat AppendValue(FCurChar); GetChar; until (FCurChar = #0) or not IsNameChar(FCurChar); end; end; procedure TXMLReader.RaiseNameNotFound; begin RaiseExc('Expected letter, "_" or ":" for name, found "' + FCurChar + '"'); end; function TXMLReader.ExpectName: WideString; // [5] begin if not CheckName then RaiseNameNotFound; SetString(Result, PWideChar(@FValue[0]), FValueLength); end; procedure TXMLReader.SkipName; begin if not CheckName then RaiseNameNotFound; end; // --------------------- procedure TXMLReader.ExpectAttValue(attr: TDOMAttr); // [10] procedure FlushStringBuffer; var s: WideString; begin if FValueLength > 0 then begin SetString(s, PWideChar(@FValue[0]),FValueLength); FValueLength := 0; attr.AppendChild(doc.CreateTextNode(s)); //SetLength(s, 0); // cleared implicitly end; end; var Delim: WideChar; begin if (FCurChar <> '''') and (FCurChar <> '"') then RaiseExpectedQmark; Delim := FCurChar; GetChar; // skip quote FValueLength := 0; while (FCurChar <> Delim) and (FCurChar <> #0) do begin if FCurChar <> '&' then begin AppendValue(FCurChar); GetChar; end else begin if FValueLength > 0 then FlushStringBuffer; ParseReference(attr); FValueLength := 0; end; end; if FValueLength > 0 then FlushStringBuffer; GetChar; // skip trailing quote ResolveEntities(Attr); end; procedure TXMLReader.SkipPubidLiteral; // [12] var Delim: WideChar; begin if (FCurChar = '''') or (FCurChar = '"') then begin Delim := FCurChar; GetChar; // skip quote SkipString(PubidChars - [Char(Delim)]); // <-- PubidChars do not contain `"` ExpectChar(Delim); end else RaiseExpectedQMark; end; // starting '= 3) and (FValue[FValueLength-1] = '>') and (FValue[FValueLength-2] = '-') and (FValue[FValueLength-3] = '-') then begin Dec(FValueLength, 3); Break; end; until FCurChar = #0; // should not happen SetString(comment, PWideChar(@FValue[0]), FValueLength); AOwner.AppendChild(doc.CreateComment(comment)); end; // starting '?' contained in FCurChar procedure TXMLReader.ParsePI; // [16] begin GetChar; // skip '?' SkipName; // ugly but uses no temp string. Need StrLIComp(PWideChar, PWideChar). if (FValueLength = 3) and ((FValue[0] = 'X') or (FValue[0] = 'x')) and ((FValue[1] = 'M') or (FValue[1] = 'm')) and ((FValue[2] = 'L') or (FValue[2] = 'l')) then begin if not FPrologParsed then begin ParseProlog; FPrologParsed := True; Exit; end else RaiseExc('"= 2) and (FValue[FValueLength-1] = '>') and (FValue[FValueLength-2] = '?') then begin Dec(FValueLength, 2); Break; end; until FCurChar = #0; // should not happen end; end; // here we come from ParsePI, 'xml' is already consumed procedure TXMLReader.ParseProlog; var Delim: WideChar; begin // '' // VersionInfo: S 'version' Eq (' VersionNum ' | " VersionNum ") SkipWhitespace; ExpectString('version'); ExpectEq; if (FCurChar = '''') or (FCurChar = '"') then begin Delim := FCurChar; GetChar; // skip quote if doc.InheritsFrom(TXMLDocument) then TXMLDocument(doc).XMLVersion := GetString(NmToken); {['a'..'z', 'A'..'Z', '0'..'9', '_', '.', ':', '-']} ExpectChar(Delim); end else RaiseExpectedQMark; // EncodingDecl? SkipEncodingDecl; // SDDecl? SkipWhitespace; if CheckForChar('s') then begin ExpectString('tandalone'); ExpectEq; if (FCurChar = '''') or (FCurChar = '"') then begin Delim := FCurChar; GetChar; // skip quote ExpectName; // TODO: must check for 'yes' or 'no' ExpectChar(Delim); end else RaiseExpectedQMark; SkipWhitespace; end; ExpectString('?>'); end; procedure TXMLReader.ExpectProlog; // [22] var DocType: TXMLReaderDocumentType; begin FPrologParsed := False; // The special case when first chars had been consumed by DetectEncoding() if FCurChar = '?' then ParsePI; // Check for "Misc*" ParseMisc(doc); // Check for "(doctypedecl Misc*)?" [28] if CheckForChar('D') then begin ExpectString('OCTYPE'); // create the DTD object DocType := TXMLReaderDocumentType.Create(doc as TXMLReaderDocument); if doc.InheritsFrom(TXMLReaderDocument) then TXMLReaderDocument(doc).SetDocType(DocType); SkipWhitespace; DocType.Name := ExpectName; SkipWhitespace; ParseExternalID; // may be absent, ignore result SkipWhitespace; if CheckForChar('[') then begin repeat SkipWhitespace; until not (ParseMarkupDecl or ParsePEReference); ExpectChar(']'); SkipWhitespace; ExpectChar('>'); end; ParseMisc(doc); end; end; function TXMLReader.ParseEq: Boolean; // [25] begin SkipWhitespace; Result := CheckForChar('='); if Result then SkipWhitespace; end; procedure TXMLReader.ExpectEq; begin if not ParseEq then RaiseExc('Expected "="'); end; // Parse "Misc*": // Misc ::= Comment | PI | S procedure TXMLReader.ParseMisc(AOwner: TDOMNode); // [27] begin repeat SkipWhitespace; if FCurChar <> '<' then Break; GetChar; if FCurChar = '!' then begin GetChar; if FCurChar = '-' then ParseComment(AOwner) else Break; end else if FCurChar = '?' then ParsePI else Break; until False; end; { DTD stuff } procedure TXMLReader.ParseElementDecl; // [45] procedure ExpectChoiceOrSeq; // [49], [50] procedure ExpectCP; // [48] begin if CheckForChar('(') then ExpectChoiceOrSeq else SkipName; if CheckForChar('?') then else if CheckForChar('*') then else if CheckForChar('+') then; end; var delimiter: WideChar; begin SkipWhitespace; ExpectCP; SkipWhitespace; delimiter := #0; repeat if (FCurChar = #0) or CheckForChar(')') then Break; if delimiter = #0 then begin if (FCurChar = '|') or (FCurChar = ',') then delimiter := FCurChar else RaiseExc('Expected "|" or ","'); end else if FCurChar <> delimiter then RaiseExc(delimiter, FCurChar); GetChar; // skip delimiter SkipWhitespace; ExpectCP; until False; end; begin // starting ''); end; // starting ''); end; // starting '') do begin SkipName; ExpectWhitespace; // Get AttType [54], [55], [56] // TODO: possibly extract all letters and compare with list... if FCurChar = 'C' then ExpectString('CDATA') else if CheckForChar('I') then // ID, IDREF, IDREFS begin ExpectChar('D'); if FCurChar = 'R' then begin ExpectString('REF'); CheckForChar('S'); end; end else if FCurChar = 'E' then begin ExpectString('ENTIT'); if not CheckForChar('Y') then ExpectString('IES'); end else if CheckForChar('N') then begin if FCurChar = 'M' then begin ExpectString('TOKEN'); CheckForChar('S'); end else if FCurChar = 'O' then // [57], [58] begin ExpectString('OTATION'); ExpectWhitespace; ExpectChar('('); SkipWhitespace; SkipName; SkipWhitespace; while not CheckForChar(')') do begin ExpectChar('|'); SkipWhitespace; SkipName; SkipWhitespace; end; end; end else if CheckForChar('(') then begin // [59] SkipWhitespace; SkipString(Nmtoken); SkipWhitespace; while not CheckForChar(')') do begin ExpectChar('|'); SkipWhitespace; SkipString(Nmtoken); SkipWhitespace; end; end else RaiseExc('Invalid tokenized type'); ExpectWhitespace; // Get DefaultDecl [60] ValueRequired := False; if CheckForChar('#') then begin if FCurChar = 'R' then ExpectString('REQUIRED') else if FCurChar = 'I' then ExpectString('IMPLIED') else if FCurChar = 'F' then begin ExpectString('FIXED'); SkipWhitespace; ValueRequired := True; end; end else ValueRequired := True; if ValueRequired then begin attr := doc.CreateAttribute(''); ExpectAttValue(attr); end; SkipWhitespace; end; end; // starting ''); end; function TXMLReader.ParseMarkupDecl: Boolean; // [29] begin Result := False; repeat SkipWhitespace; if FCurChar <> '<' then // condition is true for #0 Exit; GetChar; if FCurChar = '!' then begin GetChar; if FCurChar = 'E' then // either ELEMENT or ENTITY begin GetChar; if FCurChar = 'L' then ParseElementDecl else if FCurChar = 'N' then ParseEntityDecl; end else if FCurChar = 'A' then // ATTLIST ParseAttlistDecl else if FCurChar = 'N' then // NOTATION ParseNotationDecl else if FCurChar = '-' then ParseComment(Doc); end else if FCurChar = '?' then ParsePI; until False; end; procedure TXMLReader.ProcessDTD(ABuf: PChar; const AFilename: String); begin buf := ABuf; Filename := AFilename; FLine := 1; FColumn := 0; FEncoding := enUTF8; // TODO: Detect? Don't know for sure now... GetChar; doc := TXMLReaderDocument.Create; ParseMarkupDecl; // TODO: PEReferences? { if buf[0] <> #0 then begin DebugLn('=== Unparsed: ==='); //DebugLn(buf); DebugLn(StrLen(buf), ' chars'); end; } end; procedure TXMLReader.ParseCharData(AOwner: TDOMNode); // [14] var nonWs: Boolean; name: WideString; begin FValueLength := 0; nonWs := False; while not ((FCurChar = #0) or (FCurChar = '<') or (FCurChar = '&')) do begin if not IsWhitespace(FCurChar) then nonWs := True; AppendValue(FCurChar); GetChar; end; if nonWs then begin SetString(name, PWideChar(@FValue[0]), FValueLength); AOwner.AppendChild(doc.CreateTextNode(name)); end; end; // starting '= 3) and (FValue[FValueLength-1] = '>') and (FValue[FValueLength-2] = ']') and (FValue[FValueLength-3] = ']') then begin Dec(FValueLength, 3); Break; end; until FCurChar = #0; SetString(name, PWideChar(@FValue[0]), FValueLength); AOwner.AppendChild(doc.CreateCDATASection(name)); end; { returns True at end of stream. this is ok for fragments but error for document returns False when '<' is followed by ([^![?] | NameStartChar) this is ok for document (expect ETag then) but error for fragment } function TXMLReader.ParseElementContent(AOwner: TDOMNode): Boolean; begin Result := False; repeat SkipWhitespace; if FCurChar = '<' then begin GetChar; if FCurChar = '!' then begin GetChar; if FCurChar = '[' then ParseCDSect(AOwner) else if FCurChar = '-' then ParseComment(AOwner); end else if FCurChar = '?' then ParsePI else if CheckName then ParseElement(AOwner) else Exit; end else if FCurChar = '&' then ParseReference(AOwner) else ParseCharData(AOwner); until FCurChar = #0; Result := True; end; // Element name already in FValueBuffer procedure TXMLReader.ParseElement(AOwner: TDOMNode); // [39] [40] [44] var NewElem: TDOMElement; IsEmpty: Boolean; attr: TDOMAttr; name: WideString; begin {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt(' ParseElement A');{$ENDIF} SetString(name, PWideChar(@FValue[0]), FValueLength); NewElem := doc.CreateElement(name); AOwner.AppendChild(NewElem); SkipWhitespace; IsEmpty := False; while not CheckForChar('>') do begin {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt(' ParseElement E');{$ENDIF} if CheckForChar('/') then begin ExpectChar('>'); // '>' should follow, otherwise it's fatal error IsEmpty := True; Break; end; // Get Attribute [41] attr := doc.CreateAttribute(ExpectName); NewElem.Attributes.SetNamedItem(attr); ExpectEq; ExpectAttValue(attr); SkipWhitespace; end; if not IsEmpty then begin if not ParseElementContent(NewElem) then begin if CheckForChar('/') then // Get ETag [42] begin if ExpectName <> NewElem.NodeName then RaiseExc('Unmatching element end tag (expected "")'); SkipWhitespace; ExpectChar('>'); end else RaiseExc('Invalid name start character'); end; end; {$IFDEF MEM_CHECK}CheckHeapWrtMemCnt(' ParseElement END');{$ENDIF} ResolveEntities(NewElem); end; procedure TXMLReader.ExpectElement(AOwner: TDOMNode); begin if CheckName then ParseElement(AOwner) else RaiseExc('Expected element'); end; function TXMLReader.ParsePEReference: Boolean; // [69] begin Result := CheckForChar('%'); if Result then begin SkipName; ExpectChar(';'); end; end; // FCurChar = '&' here procedure TXMLReader.ParseReference(AOwner: TDOMNode); // [67] [68] var StrBuf: array[0..31] of char; StrLength: Integer; s: string; Value: Integer; PrevNode: TDomNode; procedure AppendChar(c: WideChar); begin if StrLength < High(StrBuf) then begin StrBuf[StrLength] := char(c); Inc(StrLength); end; GetChar; end; begin GetChar; // skip '&' if CheckForChar('#') then begin // Test for CharRef [66] StrLength := 0; if CheckForChar('x') then begin AppendChar('$'); while ((ord(FCurChar) < 256) and (char(FCurChar) in ['0'..'9', 'a'..'f', 'A'..'F'])) do AppendChar(FCurChar); end else while ((ord(FCurChar) < 256) and (char(FCurChar) in ['0'..'9'])) do AppendChar(FCurChar); // TODO: get rid of temp string here SetString(s, StrBuf, StrLength); // This will handle case of no digits present Value := StrToIntDef(s, -1); if (Value < 0) or (Value > $FFFF) then RaiseExc('Invalid character reference') else begin PrevNode := AOwner.LastChild; // TODO: partial solution, check other similar cases if Assigned(PrevNode) and (PrevNode.NodeType = TEXT_NODE) then TDomCharacterData(PrevNode).AppendData(WideChar(Value)) else AOwner.AppendChild(doc.CreateTextNode(WideChar(Value))); end; end else AOwner.AppendChild(doc.CreateEntityReference(ExpectName)); ExpectChar(';'); end; function TXMLReader.ParseExternalID: Boolean; // [75] procedure SkipSystemLiteral; var Delim: WideChar; begin if (FCurChar = '''') or (FCurChar = '"') then begin Delim := FCurChar; GetChar; // skip quote while (FCurChar <> Delim) and (FCurChar <> #0) do begin GetChar; end; ExpectChar(Delim); // <-- to check the EOF only end; end; begin if FCurChar = 'S' then begin ExpectString('SYSTEM'); ExpectWhitespace; SkipSystemLiteral; Result := True; end else if FCurChar = 'P' then begin ExpectString('PUBLIC'); ExpectWhitespace; SkipPubidLiteral; ExpectWhitespace; SkipSystemLiteral; Result := True; end else Result := False; end; procedure TXMLReader.ExpectExternalID; begin if not ParseExternalID then RaiseExc('Expected external ID'); end; procedure TXMLReader.SkipEncodingDecl; // [80] var Delim: WideChar; begin SkipWhitespace; if CheckForChar('e') then begin ExpectString('ncoding'); ExpectEq; if (FCurChar = '''') or (FCurChar = '"') then begin Delim := FCurChar; GetChar; // skip quote if not ((ord(FCurChar) < 256) and (char(FCurChar) in ['A'..'Z', 'a'..'z'])) then RaiseExc('Expected character (A-Z, a-z)'); SkipString(['A'..'Z', 'a'..'z', '0'..'9', '.', '_', '-']); ExpectChar(Delim); end; end; end; { Currently, this method will only resolve the entities which are predefined in XML: } procedure TXMLReader.ResolveEntities(RootNode: TDOMNode); var Node, NextNode: TDOMNode; procedure ReplaceEntityRef(EntityNode: TDOMNode; const Replacement: WideString); var PrevSibling, NextSibling: TDOMNode; begin PrevSibling := EntityNode.PreviousSibling; NextSibling := EntityNode.NextSibling; if Assigned(PrevSibling) and (PrevSibling.NodeType = TEXT_NODE) then begin TDOMCharacterData(PrevSibling).AppendData(Replacement); RootNode.RemoveChild(EntityNode); if Assigned(NextSibling) and (NextSibling.NodeType = TEXT_NODE) then begin // next sibling is to be removed, so we can't use it anymore NextNode := NextSibling.NextSibling; TDOMCharacterData(PrevSibling).AppendData( TDOMCharacterData(NextSibling).Data); RootNode.RemoveChild(NextSibling); end end else if Assigned(NextSibling) and (NextSibling.NodeType = TEXT_NODE) then begin TDOMCharacterData(NextSibling).InsertData(0, Replacement); RootNode.RemoveChild(EntityNode); end else RootNode.ReplaceChild(Doc.CreateTextNode(Replacement), EntityNode); end; begin Node := RootNode.FirstChild; while Assigned(Node) do begin NextNode := Node.NextSibling; if Node.NodeType = ENTITY_REFERENCE_NODE then if Node.NodeName = 'amp' then ReplaceEntityRef(Node, '&') else if Node.NodeName = 'apos' then ReplaceEntityRef(Node, '''') else if Node.NodeName = 'gt' then ReplaceEntityRef(Node, '>') else if Node.NodeName = 'lt' then ReplaceEntityRef(Node, '<') else if Node.NodeName = 'quot' then ReplaceEntityRef(Node, '"'); Node := NextNode; end; end; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: File); var reader: TXMLReader; buf: PChar; BufSize: LongInt; begin ADoc := nil; BufSize := FileSize(f) + 2; // need double termination for the case of Unicode if BufSize <= 2 then exit; GetMem(buf, BufSize); try BlockRead(f, buf^, BufSize - 2); buf[BufSize - 1] := #0; buf[BufSize] := #0; Reader := TXMLReader.Create; try Reader.ProcessXML(buf, TFileRec(f).name); ADoc := TXMLDocument(Reader.doc); finally Reader.Free; end; finally FreeMem(buf); end; end; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); var reader: TXMLReader; buf: PChar; StreamSize: Int64; begin ADoc := nil; StreamSize := f.Size; // access to Size causes at least two seeks... if StreamSize = 0 then exit; GetMem(buf, StreamSize + 2); try f.Read(buf^, StreamSize); buf[StreamSize] := #0; buf[StreamSize+1] := #0; Reader := TXMLReader.Create; try Reader.ProcessXML(buf, AFilename); finally ADoc := TXMLDocument(Reader.doc); Reader.Free; end; finally FreeMem(buf); end; end; procedure ReadXMLFile(out ADoc: TXMLDocument; var f: TStream); begin ReadXMLFile(ADoc, f, ''); end; procedure ReadXMLFile(out ADoc: TXMLDocument; const AFilename: String); var FileStream: TStream; begin ADoc := nil; FileStream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite); if FileStream = nil then exit; //? it throws exception if cannot be created... try ReadXMLFile(ADoc, FileStream, AFilename); finally FileStream.Free; end; end; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: File); var Reader: TXMLReader; buf: PChar; BufSize: LongInt; begin BufSize := FileSize(f) + 2; if BufSize <= 2 then exit; GetMem(buf, BufSize); try BlockRead(f, buf^, BufSize - 2); buf[BufSize - 1] := #0; buf[BufSize] := #0; Reader := TXMLReader.Create; try Reader.Doc := AParentNode.OwnerDocument; Reader.ProcessFragment(AParentNode, buf, TFileRec(f).name); finally Reader.Free; end; finally FreeMem(buf); end; end; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream; const AFilename: String); var Reader: TXMLReader; buf: PChar; StreamSize: Int64; begin StreamSize := f.Size; if StreamSize = 0 then exit; GetMem(buf, StreamSize + 2); try f.Read(buf^, StreamSize); buf[StreamSize] := #0; buf[StreamSize+1] := #0; Reader := TXMLReader.Create; Reader.Doc := AParentNode.OwnerDocument; try Reader.ProcessFragment(AParentNode, buf, AFilename); finally Reader.Free; end; finally FreeMem(buf); end; end; procedure ReadXMLFragment(AParentNode: TDOMNode; var f: TStream); begin ReadXMLFragment(AParentNode, f, ''); end; procedure ReadXMLFragment(AParentNode: TDOMNode; const AFilename: String); var Stream: TStream; begin Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite); try ReadXMLFragment(AParentNode, Stream, AFilename); finally Stream.Free; end; end; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: File); var Reader: TXMLReader; buf: PChar; BufSize: LongInt; begin ADoc := nil; BufSize := FileSize(f) + 1; if BufSize <= 1 then exit; GetMem(buf, BufSize); try BlockRead(f, buf^, BufSize - 1); buf[BufSize - 1] := #0; Reader := TXMLReader.Create; try Reader.ProcessDTD(buf, TFileRec(f).name); ADoc := TXMLDocument(Reader.doc); finally Reader.Free; end; finally FreeMem(buf); end; end; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream; const AFilename: String); var Reader: TXMLReader; buf: PChar; begin ADoc := nil; if f.Size = 0 then exit; GetMem(buf, f.Size + 1); try f.Read(buf^, f.Size); buf[f.Size] := #0; Reader := TXMLReader.Create; try Reader.ProcessDTD(buf, AFilename); ADoc := TXMLDocument(Reader.doc); finally Reader.Free; end; finally FreeMem(buf); end; end; procedure ReadDTDFile(out ADoc: TXMLDocument; var f: TStream); begin ReadDTDFile(ADoc, f, ''); end; procedure ReadDTDFile(out ADoc: TXMLDocument; const AFilename: String); var Stream: TStream; begin ADoc := nil; Stream := TFileStream.Create(AFilename, fmOpenRead+fmShareDenyWrite); try ReadDTDFile(ADoc, Stream, AFilename); finally Stream.Free; end; end; end.