// -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- // // System.Xml.XmlInputSource.cs // port of Open Xml TXmlInputSource class // // Author: // Daniel Weber (daniel-weber@austin.rr.com // // (C) 2001 Daniel Weber // // using System; using System.IO; namespace System.Xml { internal class XmlInputSource { private DomEncodingType Fencoding; private string FpublicID; private string FsystemID; public string FrootName; private Stream FStream; private bool FLastCharWasCR; // locator int FColumnNumber; int FStartColumnNumber; int FStartLineNumber; bool FLastWCharWasLF; int FLineNumber; bool FPieceEndSet; // Buffer storage for UTF-8 surrogates // see http://www.ietf.org/rfc/rfc2279.txt for a complete description of UTF-8 encoding private int FLastUcs4; //FLocator: TdomStandardLocator; // public properties //=========================================================================== public DomEncodingType encoding { get { return Fencoding; } } //property locator: TdomStandardLocator read FLocator; public string publicId { get { return FpublicID; } } public string rootName { get { return FrootName; } } public Stream stream { get { return FStream; } } public string streamAsWideString { get { return string.Empty; //wideString read getStreamAsWideString; } } public string systemId { get { return FsystemID; } } public int columnNumber { get { return FColumnNumber; } } public int lineNumber { get { return FLineNumber; } } public int startColumnNumber { get { return FStartColumnNumber; } } public int startLineNumber { get { return FStartLineNumber; } } // private methods //=========================================================================== /// /// Analyze the first bytes of an XML document to try and determine encoding /// /// Determined encoding type, defaults to UTF-8 private void setEncodingType() { try { byte[] buf = new byte[4]; FStream.Seek(0, SeekOrigin.Begin); FStream.Read(buf, 0, 4); // UTF-16 code streams should begin with 0xfeff for big-endian systems // or 0xfffe for little endian systems. // check that first.... if ( (buf[0] == 0xfe) & (buf[1] == 0xff) ) Fencoding = DomEncodingType.etUTF16BE; else if ( (buf[0] == 0xff) & (buf[1] == 0xfe) ) Fencoding = DomEncodingType.etUTF16LE; else { // assume utf-8, look for encoding in tag, eg Fencoding = DomEncodingType.etUTF8; // Check if the stream begins with ') break; } // start from the location of "encoding", and scan for quotes string encodeString = readEncodingAttrFromTag(tag); encodeString = encodeString.ToUpper(); if ( (encodeString.IndexOf("ISO-8859-1") != -1) | (encodeString.IndexOf("LATIN-1") != -1) ) Fencoding = DomEncodingType.etLatin1; else if ( (encodeString.IndexOf("ISO-8859-2") != -1) | (encodeString.IndexOf("LATIN-2") != -1) ) Fencoding = DomEncodingType.etLatin2; else if ( (encodeString.IndexOf("ISO-8859-3") != -1) | (encodeString.IndexOf("LATIN-3") != -1) ) Fencoding = DomEncodingType.etLatin3; else if ( (encodeString.IndexOf("ISO-8859-4") != -1) | (encodeString.IndexOf("LATIN-4") != -1) ) Fencoding = DomEncodingType.etLatin4; else if ( (encodeString.IndexOf("ISO-8859-5") != -1) | (encodeString.IndexOf("CYRILLIC") != -1) ) Fencoding = DomEncodingType.etCyrillic; else if ( (encodeString.IndexOf("ISO-8859-6") != -1) | (encodeString.IndexOf("ARABIC") != -1) ) Fencoding = DomEncodingType.etArabic; else if ( (encodeString.IndexOf("ISO-8859-7") != -1) | (encodeString.IndexOf("GREEK") != -1) ) Fencoding = DomEncodingType.etGreek; else if ( (encodeString.IndexOf("ISO-8859-8") != -1) | (encodeString.IndexOf("HEBREW") != -1) ) Fencoding = DomEncodingType.etHebrew; else if ( (encodeString.IndexOf("ISO-8859-9") != -1) | (encodeString.IndexOf("LATIN-5") != -1) ) Fencoding = DomEncodingType.etLatin5; else if ( (encodeString.IndexOf("ISO-8859-10") != -1) | (encodeString.IndexOf("LATIN-6") != -1) ) Fencoding = DomEncodingType.etLatin6; else if ( (encodeString.IndexOf("ISO-8859-13") != -1) | (encodeString.IndexOf("LATIN-7") != -1) ) Fencoding = DomEncodingType.etLatin7; else if ( (encodeString.IndexOf("ISO-8859-14") != -1) | (encodeString.IndexOf("LATIN-8") != -1) ) Fencoding = DomEncodingType.etLatin8; else if ( (encodeString.IndexOf("ISO-8859-15") != -1) | (encodeString.IndexOf("LATIN-9") != -1) ) Fencoding = DomEncodingType.etLatin9; else if (encodeString.IndexOf("KOI8-R") != -1) Fencoding = DomEncodingType.etKOI8R; else if (encodeString.IndexOf("CP10000_MACROMAN") != -1) Fencoding = DomEncodingType.etcp10000_MacRoman; else if ( (encodeString.IndexOf("Windows-1250") != -1) | (encodeString.IndexOf("CP1250") != -1) ) Fencoding = DomEncodingType.etcp1250; else if ( (encodeString.IndexOf("Windows-1251") != -1) | (encodeString.IndexOf("CP1251") != -1) ) Fencoding = DomEncodingType.etcp1251; else if ( (encodeString.IndexOf("Windows-1252") != -1) | (encodeString.IndexOf("CP1252") != -1) ) Fencoding = DomEncodingType.etcp1252; } } } catch { Fencoding = DomEncodingType.etUTF8; } FStream.Seek(0, SeekOrigin.Begin); } /// /// Helper function to try and find the encoding attribute value in /// declaration tag. Does not do well-formedness checks. /// /// string to scan /// If bad encoding char found, mis-matched quotes, or no equals sign. /// encoding, or string.Empty if it is not found. private string readEncodingAttrFromTag( string tag ) { int encodeIndex = tag.IndexOf("encoding"); if ( encodeIndex == -1) return string.Empty; else { int curIndex = encodeIndex + "encoding".Length; bool firstQuoteFound = false; bool equalsFound = false; char quoteChar = (char) 0xffff; // c# insists on initialization... string encoding = ""; while ( curIndex != tag.Length ) { char c = tag[curIndex]; curIndex++; if ( c == '=') { equalsFound = true; continue; } if ( (c== '\"') | (c=='\'') ) { if ( !firstQuoteFound & !equalsFound) throw new InvalidOperationException("No equals sign found in encoding attribute"); else if ( firstQuoteFound ) { if (c == quoteChar) return encoding; else throw new InvalidOperationException("non-matching quotes in attribute value"); } else { firstQuoteFound = true; quoteChar = c; continue; } } else if (firstQuoteFound) { if ( ( c >= 'a') & ( c <= 'z')) encoding += c; else if ( ( c >= 'A') & ( c <= 'Z')) encoding += c; else if ( ( c >= '0') & ( c <= '9')) encoding += c; else if ( c == '_' ) encoding += c; else if ( c == '-') encoding += c; else if (c == '.') encoding += c; else throw new InvalidOperationException("invalid character in encoding attribute"); } } return string.Empty; } } private void evaluate(char c) { if (FLastWCharWasLF) { FLineNumber++; FLastWCharWasLF = false; FColumnNumber = 1; } else FColumnNumber++; if (c == (char) 10 ) FLastWCharWasLF = true; if (FPieceEndSet) pieceStart(); } public void pieceEnd() { FPieceEndSet = true; } public void pieceStart() { FStartColumnNumber = FColumnNumber; FStartLineNumber = FLineNumber; FPieceEndSet = false; } /// /// Return true if input stream is at EOF. /// /// public bool atEOF() { return (FStream.Length == FStream.Position); } /// /// Sets the internal root name by analyzing the tags at the beginning of the stream. /// root name is: /// - the element tag of the first element found /// - the root name listed in a !DOCTYPE tag /// - empty if a parse error occurs, or no applicable tags are found. /// Does not do well-formedness checks - skips comments and proc. instructions /// private void getRootName() { reset(); FrootName = string.Empty; while ( ! atEOF() ) { string tag = "<"; char c = getNextChar(); // skip whitespace to first tag while ( !atEOF() && (XmlNames_1_0.IsXmlWhiteSpace( c )) ) c = getNextChar(); if ( (c != '<') | atEOF() ) break; while ( !atEOF() & ( c != '>' ) ) { c = getNextChar(); tag += c; } if ( atEOF() ) break; // Only allow 1) comments, 2) processing instructions before if ( tag.StartsWith("') && (doctypeTag[start] != '[') && (doctypeTag[start] != '/') ) tmp += doctypeTag[start]; if (XmlNames_1_0.isXmlName(tmp) ) FrootName = tmp; } /// /// Read in the next character (either UTF-8 or UTF-16) and convert by charset /// Normalize CR/LF pairs to single CR. /// /// public char getNextChar() { byte[] buf = new byte[2]; char retval = (char) 0xffff; int bCount; switch(Fencoding) { case DomEncodingType.etLatin1: bCount = stream.Read(buf,0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_1ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin2: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_2ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin3: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_3ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin4: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_4ToUTF16Char(buf[0]); break; case DomEncodingType.etCyrillic: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_5ToUTF16Char(buf[0]); break; case DomEncodingType.etArabic: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_6ToUTF16Char(buf[0]); break; case DomEncodingType.etGreek: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_7ToUTF16Char(buf[0]); break; case DomEncodingType.etHebrew: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_8ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin5: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_9ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin6: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_10ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin7: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_13ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin8: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_14ToUTF16Char(buf[0]); break; case DomEncodingType.etLatin9: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.Iso8859_15ToUTF16Char(buf[0]); break; case DomEncodingType.etKOI8R: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.KOI8_RToUTF16Char(buf[0]); break; case DomEncodingType.etcp10000_MacRoman: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.cp10000_MacRomanToUTF16Char(buf[0]); break; case DomEncodingType.etcp1250: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.cp1250ToUTF16Char(buf[0]); break; case DomEncodingType.etcp1251: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.cp1251ToUTF16Char(buf[0]); break; case DomEncodingType.etcp1252: bCount = stream.Read(buf, 0, 1); if (bCount == 1) retval = XmlUtil.cp1252ToUTF16Char(buf[0]); break; case DomEncodingType.etUTF8: if ( FLastUcs4 >= 0x10000) { // Output low surrogate retval = XmlUtil.Utf16LowSurrogate(FLastUcs4); FLastUcs4 = 0; } else { FLastUcs4 = XmlUtil.ReadUTF8Char( stream ); if ( FLastUcs4 >= 0x10000) retval = XmlUtil.Utf16HighSurrogate(FLastUcs4); else retval = (char) FLastUcs4; } break; case DomEncodingType.etUTF16BE: bCount = stream.Read(buf, 0, 2); if (bCount == 2) retval = System.Convert.ToChar( (buf[0] << 16) + buf[1] ); break; case DomEncodingType.etUTF16LE: bCount = stream.Read(buf, 0, 2); if (bCount == 2) retval = System.Convert.ToChar( (buf[1] << 16) + buf[0] ); break; } // normalize CRLF or a single CR to LF: if ( (retval == 0x000D) & FLastCharWasCR) // 0x000d = CR { FLastCharWasCR = false; return getNextChar(); } else if ( retval == 0x000A) // 0x000a = LF { FLastCharWasCR = true; return (char) 0x000D; } else FLastCharWasCR = false; evaluate(retval); return retval; } /// /// Reset the Input to the origin and clear internal variables. /// public void reset() { FLastUcs4 = 0; FLastCharWasCR = false; switch(Fencoding) { // skip the leading 0xfeff/oxfffe on UTF-16 streams case DomEncodingType.etUTF16BE: FStream.Seek(2, SeekOrigin.Begin); break; case DomEncodingType.etUTF16LE: FStream.Seek(2, SeekOrigin.Begin); break; default: FStream.Seek(0, SeekOrigin.Begin); break; } FColumnNumber = 0; FLineNumber = 0; FStartColumnNumber = 0; FStartLineNumber = 0; FLastWCharWasLF = true; pieceEnd(); } /* * private protected function getStreamAsWideString: wideString; virtual; procedure skipTextDecl(const locator: TdomStandardLocator); virtual; public constructor create(const stream: TStream; const publicId, systemId: wideString); virtual; destructor destroy; override; */ // Constructor //=========================================================================== XmlInputSource(Stream inputStream, string publicID, string systemID) { if (inputStream == null) throw new NullReferenceException("Null stream passed to XmlInputSource constructor"); FStream = inputStream; FLastUcs4 = 0; FLastCharWasCR = false; FpublicID = publicID; FsystemID = systemID; setEncodingType(); //FLocator:= TdomStandardLocator.create(self); getRootName(); } } }