// HtmlAgilityPack V1.0 - Simon Mourier
using System;
using System.IO;
using System.Text;
using System.Collections;

namespace HtmlAgilityPack
{
    /// <summary>
    /// Represents the type of fragment in a mixed code document.
    /// </summary>
    public enum MixedCodeDocumentFragmentType
    {
        /// <summary>
        /// The fragment contains code.
        /// </summary>
        Code,

        /// <summary>
        /// The fragment contains text.
        /// </summary>
        Text,
    }

    /// <summary>
    /// Represents a fragment of code in a mixed code document.
    /// </summary>
    public class MixedCodeDocumentCodeFragment : MixedCodeDocumentFragment
    {
        internal string _code;

        internal MixedCodeDocumentCodeFragment(MixedCodeDocument doc)
            : base(doc, MixedCodeDocumentFragmentType.Code)
        {
        }

        /// <summary>
        /// Gets or sets the fragment code text.
        /// </summary>
        /// <remarks>
        /// On first read the value is derived from <see cref="MixedCodeDocumentFragment.FragmentText"/>:
        /// the document's code start token is removed from the front, the code end token is removed
        /// from the back when it is present (an unterminated fragment has none), and the remainder is
        /// trimmed. If the trimmed code begins with '=', that character is replaced by the document's
        /// <c>TokenResponseWrite</c>. The computed value is cached.
        /// </remarks>
        public string Code
        {
            get
            {
                if (_code == null)
                {
                    string text = FragmentText;
                    int start = 0;
                    int end = text.Length;

                    // Strip the delimiters only when they are really there, so that an
                    // unterminated fragment does not lose trailing code characters.
                    string startToken = _doc.TokenCodeStart;
                    if (startToken != null
                        && startToken.Length <= end
                        && string.CompareOrdinal(text, 0, startToken, 0, startToken.Length) == 0)
                    {
                        start = startToken.Length;
                    }

                    string endToken = _doc.TokenCodeEnd;
                    if (endToken != null
                        && endToken.Length <= end - start
                        && string.CompareOrdinal(text, end - endToken.Length, endToken, 0, endToken.Length) == 0)
                    {
                        end -= endToken.Length;
                    }

                    _code = text.Substring(start, end - start).Trim();

                    // "<%= expr %>" shorthand: character comparison keeps this culture-independent.
                    if (_code.Length > 0 && _code[0] == '=')
                    {
                        _code = _doc.TokenResponseWrite + _code.Substring(1);
                    }
                }
                return _code;
            }
            set
            {
                _code = value;
            }
        }
    }

    /// <summary>
    /// Represents a fragment of text in a mixed code document.
    /// </summary>
    public class MixedCodeDocumentTextFragment : MixedCodeDocumentFragment
    {
        internal MixedCodeDocumentTextFragment(MixedCodeDocument doc)
            : base(doc, MixedCodeDocumentFragmentType.Text)
        {
        }

        /// <summary>
        /// Gets or sets the fragment text. Setting it replaces the cached fragment text.
        /// </summary>
        public string Text
        {
            get
            {
                return FragmentText;
            }
            set
            {
                base._fragmenttext = value;
            }
        }
    }

    /// <summary>
    /// Represents a base class for fragments in a mixed code document.
    /// </summary>
    public abstract class MixedCodeDocumentFragment
    {
        internal MixedCodeDocumentFragmentType _type;
        internal MixedCodeDocument _doc;
        internal int _index;
        internal int _length;
        internal int _line;
        internal int _lineposition;
        internal string _fragmenttext;

        /// <summary>
        /// Creates a fragment and registers it with the owning document: it is appended to the
        /// document's per-type list (text or code) and to its list of all fragments.
        /// </summary>
        internal MixedCodeDocumentFragment(MixedCodeDocument doc, MixedCodeDocumentFragmentType type)
        {
            _doc = doc;
            _type = type;
            switch (type)
            {
                case MixedCodeDocumentFragmentType.Text:
                    _doc._textfragments.Append(this);
                    break;

                case MixedCodeDocumentFragmentType.Code:
                    _doc._codefragments.Append(this);
                    break;
            }
            _doc._fragments.Append(this);
        }

        /// <summary>
        /// Gets the type of fragment.
        /// </summary>
        public MixedCodeDocumentFragmentType FragmentType
        {
            get
            {
                return _type;
            }
        }

        /// <summary>
        /// Gets the index of the fragment's first character in the document's loaded text.
        /// </summary>
        public int StreamPosition
        {
            get
            {
                return _index;
            }
        }

        /// <summary>
        /// Gets the line number of the fragment. The parser starts counting at 0 and adds one
        /// for every line feed character it reads.
        /// </summary>
        public int Line
        {
            get
            {
                return _line;
            }
        }

        /// <summary>
        /// Gets the line position (column) recorded by the parser for the fragment.
        /// </summary>
        public int LinePosition
        {
            get
            {
                return _lineposition;
            }
        }

        /// <summary>
        /// Gets the fragment text. It is cut out of the document's loaded text on first access
        /// and cached; it is empty when the document has no loaded text yet.
        /// </summary>
        public string FragmentText
        {
            get
            {
                if (_fragmenttext == null)
                {
                    string source = _doc._text;
                    if (source == null)
                    {
                        // fragment created before any Load call
                        _fragmenttext = string.Empty;
                    }
                    else
                    {
                        _fragmenttext = source.Substring(_index, _length);
                    }
                }
                return _fragmenttext;
            }
        }
    }

    /// <summary>
    /// Represents a list of mixed code fragments.
    /// </summary>
    public class MixedCodeDocumentFragmentList : IEnumerable
    {
        private MixedCodeDocument _doc;
        private ArrayList _items = new ArrayList();

        internal MixedCodeDocumentFragmentList(MixedCodeDocument doc)
        {
            _doc = doc;
        }

        /// <summary>
        /// Appends a fragment to the list of fragments.
        /// </summary>
        /// <param name="newFragment">The fragment to append. May not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="newFragment"/> is null.</exception>
        public void Append(MixedCodeDocumentFragment newFragment)
        {
            if (newFragment == null)
            {
                throw new ArgumentNullException("newFragment");
            }
            _items.Add(newFragment);
        }

        /// <summary>
        /// Prepends a fragment to the list of fragments.
        /// </summary>
        /// <param name="newFragment">The fragment to prepend. May not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="newFragment"/> is null.</exception>
        public void Prepend(MixedCodeDocumentFragment newFragment)
        {
            if (newFragment == null)
            {
                throw new ArgumentNullException("newFragment");
            }
            _items.Insert(0, newFragment);
        }

        /// <summary>
        /// Removes a fragment from the list of fragments.
        /// </summary>
        /// <param name="fragment">The fragment to remove. May not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="fragment"/> is null.</exception>
        /// <exception cref="IndexOutOfRangeException">The fragment is not in the list.</exception>
        public void Remove(MixedCodeDocumentFragment fragment)
        {
            if (fragment == null)
            {
                throw new ArgumentNullException("fragment");
            }
            int index = GetFragmentIndex(fragment);
            if (index == -1)
            {
                throw new IndexOutOfRangeException();
            }
            RemoveAt(index);
        }

        /// <summary>
        /// Removes a fragment from the list of fragments, using its index in the list.
        /// </summary>
        /// <param name="index">The index of the fragment to remove.</param>
        public void RemoveAt(int index)
        {
            _items.RemoveAt(index);
        }

        /// <summary>
        /// Removes all fragments from the list.
        /// </summary>
        public void RemoveAll()
        {
            _items.Clear();
        }

        /// <summary>
        /// Gets the number of fragments contained in the list.
        /// </summary>
        public int Count
        {
            get
            {
                return _items.Count;
            }
        }

        /// <summary>
        /// Finds a fragment by reference.
        /// </summary>
        /// <returns>The index of the fragment in the list, or -1 when it is not in the list.</returns>
        internal int GetFragmentIndex(MixedCodeDocumentFragment fragment)
        {
            if (fragment == null)
            {
                throw new ArgumentNullException("fragment");
            }
            for (int i = 0; i < _items.Count; i++)
            {
                if (((MixedCodeDocumentFragment)_items[i]) == fragment)
                {
                    return i;
                }
            }
            return -1;
        }

        /// <summary>
        /// Gets a fragment from the list using its index.
        /// </summary>
        public MixedCodeDocumentFragment this[int index]
        {
            get
            {
                return _items[index] as MixedCodeDocumentFragment;
            }
        }

        internal void Clear()
        {
            _items.Clear();
        }

        /// <summary>
        /// Gets an enumerator that can iterate through the fragment list.
        /// </summary>
        public MixedCodeDocumentFragmentEnumerator GetEnumerator()
        {
            return new MixedCodeDocumentFragmentEnumerator(_items);
        }

        /// <summary>
        /// Gets an enumerator that can iterate through the fragment list.
        /// </summary>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        /// <summary>
        /// Represents a fragment enumerator.
        /// </summary>
        public class MixedCodeDocumentFragmentEnumerator : IEnumerator
        {
            int _index;
            ArrayList _items;

            internal MixedCodeDocumentFragmentEnumerator(ArrayList items)
            {
                _items = items;
                _index = -1;
            }

            /// <summary>
            /// Sets the enumerator to its initial position, which is before the first element in the collection.
            /// </summary>
            public void Reset()
            {
                _index = -1;
            }

            /// <summary>
            /// Advances the enumerator to the next element of the collection.
            /// </summary>
            /// <returns>true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection.</returns>
            public bool MoveNext()
            {
                _index++;
                return (_index < _items.Count);
            }

            /// <summary>
            /// Gets the current element in the collection.
            /// </summary>
            public MixedCodeDocumentFragment Current
            {
                get
                {
                    return (MixedCodeDocumentFragment)(_items[_index]);
                }
            }

            /// <summary>
            /// Gets the current element in the collection.
            /// </summary>
            object IEnumerator.Current
            {
                get
                {
                    return (Current);
                }
            }
        }
    }

    /// <summary>
    /// Represents a document with mixed code and text. ASP, ASPX, JSP, are good example of such documents.
    /// </summary>
    public class MixedCodeDocument
    {
        private System.Text.Encoding _streamencoding = null;
        internal string _text;
        internal MixedCodeDocumentFragmentList _fragments;
        internal MixedCodeDocumentFragmentList _codefragments;
        internal MixedCodeDocumentFragmentList _textfragments;
        private ParseState _state;
        private int _index;
        private int _c;
        private int _line;
        private int _lineposition;
        private MixedCodeDocumentFragment _currentfragment;

        /// <summary>
        /// Gets or sets the token representing code start.
        /// </summary>
        public string TokenCodeStart = "<%";

        /// <summary>
        /// Gets or sets the token representing code end.
        /// </summary>
        public string TokenCodeEnd = "%>";

        /// <summary>
        /// Gets or sets the token representing code directive.
        /// </summary>
        public string TokenDirective = "@";

        /// <summary>
        /// Gets or sets the token representing response write directive.
        /// </summary>
        public string TokenResponseWrite = "Response.Write ";

        private string TokenTextBlock = "TextBlock({0})";

        /// <summary>
        /// Creates a mixed code document instance.
        /// </summary>
        public MixedCodeDocument()
        {
            _codefragments = new MixedCodeDocumentFragmentList(this);
            _textfragments = new MixedCodeDocumentFragmentList(this);
            _fragments = new MixedCodeDocumentFragmentList(this);
        }

        /// <summary>
        /// Loads a mixed code document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        public void Load(Stream stream)
        {
            Load(new StreamReader(stream));
        }

        /// <summary>
        /// Loads a mixed code document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
        {
            Load(new StreamReader(stream, detectEncodingFromByteOrderMarks));
        }

        /// <summary>
        /// Loads a mixed code document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public void Load(Stream stream, Encoding encoding)
        {
            Load(new StreamReader(stream, encoding));
        }

        /// <summary>
        /// Loads a mixed code document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
        {
            Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks));
        }

        /// <summary>
        /// Loads a mixed code document from a stream.
        /// </summary>
        /// <param name="stream">The input stream.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        /// <param name="buffersize">The minimum buffer size.</param>
        public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
        {
            Load(new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize));
        }

        /// <summary>
        /// Loads a mixed code document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        public void Load(string path)
        {
            Load(new StreamReader(path));
        }

        /// <summary>
        /// Loads a mixed code document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public void Load(string path, bool detectEncodingFromByteOrderMarks)
        {
            Load(new StreamReader(path, detectEncodingFromByteOrderMarks));
        }

        /// <summary>
        /// Loads a mixed code document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public void Load(string path, Encoding encoding)
        {
            Load(new StreamReader(path, encoding));
        }

        /// <summary>
        /// Loads a mixed code document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
        {
            Load(new StreamReader(path, encoding, detectEncodingFromByteOrderMarks));
        }

        /// <summary>
        /// Loads a mixed code document from a file.
        /// </summary>
        /// <param name="path">The complete file path to be read.</param>
        /// <param name="encoding">The character encoding to use.</param>
        /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
        /// <param name="buffersize">The minimum buffer size.</param>
        public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
        {
            Load(new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize));
        }

        /// <summary>
        /// Loads a mixed document from a text
        /// </summary>
        /// <param name="html">The text to load.</param>
        public void LoadHtml(string html)
        {
            Load(new StringReader(html));
        }

        /// <summary>
        /// Loads the mixed code document from the specified TextReader.
        /// The reader is read to its end and then closed; any previously loaded fragments are discarded.
        /// </summary>
        /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public void Load(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            // Every fragment registers itself in _fragments as well as in its per-type list,
            // so all three lists must be emptied before parsing again.
            _codefragments.Clear();
            _textfragments.Clear();
            _fragments.Clear();

            try
            {
                // all pseudo constructors get down to this one
                StreamReader sr = reader as StreamReader;
                if (sr != null)
                {
                    _streamencoding = sr.CurrentEncoding;
                }
                _text = reader.ReadToEnd();
            }
            finally
            {
                // close the reader even when reading fails
                reader.Close();
            }
            Parse();
        }

        /// <summary>
        /// Returns the encoding used by the Save overloads that take no encoding: the encoding
        /// of the stream the document was read from when known, otherwise the system default.
        /// </summary>
        internal System.Text.Encoding GetOutEncoding()
        {
            if (_streamencoding != null)
            {
                return _streamencoding;
            }
            return System.Text.Encoding.Default;
        }

        /// <summary>
        /// Gets the encoding of the stream used to read the document.
        /// </summary>
        public System.Text.Encoding StreamEncoding
        {
            get
            {
                return _streamencoding;
            }
        }

        /// <summary>
        /// Gets the list of code fragments in the document.
        /// </summary>
        public MixedCodeDocumentFragmentList CodeFragments
        {
            get
            {
                return _codefragments;
            }
        }

        /// <summary>
        /// Gets the list of text fragments in the document.
        /// </summary>
        public MixedCodeDocumentFragmentList TextFragments
        {
            get
            {
                return _textfragments;
            }
        }

        /// <summary>
        /// Gets the list of all fragments in the document.
        /// </summary>
        public MixedCodeDocumentFragmentList Fragments
        {
            get
            {
                return _fragments;
            }
        }

        /// <summary>
        /// Saves the mixed document to the specified stream. The stream is flushed but left open.
        /// </summary>
        /// <param name="outStream">The stream to which you want to save.</param>
        public void Save(Stream outStream)
        {
            // The writer is deliberately not disposed: disposing it would close the caller's stream.
            StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
            Save(sw);
        }

        /// <summary>
        /// Saves the mixed document to the specified stream. The stream is flushed but left open.
        /// </summary>
        /// <param name="outStream">The stream to which you want to save.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public void Save(Stream outStream, System.Text.Encoding encoding)
        {
            // The writer is deliberately not disposed: disposing it would close the caller's stream.
            StreamWriter sw = new StreamWriter(outStream, encoding);
            Save(sw);
        }

        /// <summary>
        /// Saves the mixed document to the specified file.
        /// </summary>
        /// <param name="filename">The location of the file where you want to save the document.</param>
        public void Save(string filename)
        {
            // The writer owns the file it opened, so it is disposed here to release the handle.
            using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
            {
                Save(sw);
            }
        }

        /// <summary>
        /// Saves the mixed document to the specified file.
        /// </summary>
        /// <param name="filename">The location of the file where you want to save the document.</param>
        /// <param name="encoding">The character encoding to use.</param>
        public void Save(string filename, System.Text.Encoding encoding)
        {
            // The writer owns the file it opened, so it is disposed here to release the handle.
            using (StreamWriter sw = new StreamWriter(filename, false, encoding))
            {
                Save(sw);
            }
        }

        /// <summary>
        /// Saves the mixed document to the specified StreamWriter.
        /// </summary>
        /// <param name="writer">The StreamWriter to which you want to save.</param>
        public void Save(StreamWriter writer)
        {
            Save((TextWriter)writer);
        }

        /// <summary>
        /// Saves the mixed document to the specified TextWriter.
        /// </summary>
        /// <remarks>
        /// Writes the <see cref="MixedCodeDocumentFragment.FragmentText"/> of every fragment in
        /// document order, then flushes the writer. Code fragments are written from their fragment
        /// text; a value assigned through <see cref="MixedCodeDocumentCodeFragment.Code"/> is not
        /// written back.
        /// </remarks>
        /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
        public void Save(TextWriter writer)
        {
            if (writer == null)
            {
                throw new ArgumentNullException("writer");
            }
            foreach (MixedCodeDocumentFragment fragment in _fragments)
            {
                writer.Write(fragment.FragmentText);
            }
            writer.Flush();
        }

        /// <summary>
        /// Gets the code represented by the mixed code document seen as a template.
        /// </summary>
        /// <remarks>
        /// One line (terminated by "\n") is produced per fragment: a code fragment contributes its
        /// <see cref="MixedCodeDocumentCodeFragment.Code"/>; a text fragment contributes
        /// <c>TokenResponseWrite</c> followed by "TextBlock(i)", where i is the zero-based ordinal
        /// of the text fragment.
        /// </remarks>
        public string Code
        {
            get
            {
                StringBuilder code = new StringBuilder();
                int textBlock = 0;
                foreach (MixedCodeDocumentFragment frag in _fragments)
                {
                    switch (frag._type)
                    {
                        case MixedCodeDocumentFragmentType.Text:
                            code.Append(TokenResponseWrite);
                            code.Append(string.Format(TokenTextBlock, textBlock));
                            code.Append("\n");
                            textBlock++;
                            break;

                        case MixedCodeDocumentFragmentType.Code:
                            code.Append(((MixedCodeDocumentCodeFragment)frag).Code);
                            code.Append("\n");
                            break;
                    }
                }
                return code.ToString();
            }
        }

        /// <summary>
        /// Create a text fragment instances.
        /// </summary>
        /// <returns>The newly created text fragment instance.</returns>
        public MixedCodeDocumentTextFragment CreateTextFragment()
        {
            return (MixedCodeDocumentTextFragment)CreateFragment(MixedCodeDocumentFragmentType.Text);
        }

        /// <summary>
        /// Create a code fragment instances.
        /// </summary>
        /// <returns>The newly created code fragment instance.</returns>
        public MixedCodeDocumentCodeFragment CreateCodeFragment()
        {
            return (MixedCodeDocumentCodeFragment)CreateFragment(MixedCodeDocumentFragmentType.Code);
        }

        /// <summary>
        /// Creates a fragment of the requested type; the fragment registers itself with this document.
        /// </summary>
        /// <exception cref="NotSupportedException">The type is neither Text nor Code.</exception>
        internal MixedCodeDocumentFragment CreateFragment(MixedCodeDocumentFragmentType type)
        {
            switch (type)
            {
                case MixedCodeDocumentFragmentType.Text:
                    return new MixedCodeDocumentTextFragment(this);

                case MixedCodeDocumentFragmentType.Code:
                    return new MixedCodeDocumentCodeFragment(this);

                default:
                    throw new NotSupportedException();
            }
        }

        // Stamps the current fragment with its start index, the current line and the given
        // line position, and resets its length until the fragment is closed.
        private void SetPosition(int index, int linePosition)
        {
            _currentfragment._line = _line;
            _currentfragment._lineposition = linePosition;
            _currentfragment._index = index;
            _currentfragment._length = 0;
        }

        // Steps past the character last stored in _c, keeping the line/column counters in sync
        // (character code 10 is a line feed).
        private void IncrementPosition()
        {
            _index++;
            if (_c == 10)
            {
                _lineposition = 1;
                _line++;
            }
            else
            {
                _lineposition++;
            }
        }

        // Steps past the remaining characters of a token whose first character has already
        // been consumed by IncrementPosition.
        private void SkipRestOfToken(string token)
        {
            _index += token.Length - 1;
            _lineposition += token.Length - 1;
        }

        // Returns true when the non-empty token occurs in the loaded text at the given position
        // (ordinal comparison, never reading past the end of the text).
        private bool IsTokenAt(int position, string token)
        {
            if (token == null || token.Length == 0)
            {
                return false;
            }
            if (position + token.Length > _text.Length)
            {
                return false;
            }
            return string.CompareOrdinal(_text, position, token, 0, token.Length) == 0;
        }

        private enum ParseState
        {
            Text,
            Code
        }

        // Splits _text into alternating text and code fragments. The fragments are contiguous
        // and non-overlapping: a code fragment runs from the first character of TokenCodeStart
        // through the last character of TokenCodeEnd (or to the end of the text when unterminated).
        private void Parse()
        {
            _state = ParseState.Text;
            _index = 0;
            // restart the counters so that re-loading reports the same positions as a first load
            _line = 0;
            _lineposition = 0;
            _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Text);

            while (_index < _text.Length)
            {
                _c = _text[_index];
                IncrementPosition();

                // IncrementPosition has already stepped past the character just read
                int tokenStart = _index - 1;

                switch (_state)
                {
                    case ParseState.Text:
                        if (IsTokenAt(tokenStart, TokenCodeStart))
                        {
                            _state = ParseState.Code;
                            _currentfragment._length = tokenStart - _currentfragment._index;
                            _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Code);
                            SetPosition(tokenStart, _lineposition);
                            // consume the whole start token so that its tail cannot be
                            // mistaken for the beginning of an end token (e.g. "<%>")
                            SkipRestOfToken(TokenCodeStart);
                        }
                        break;

                    case ParseState.Code:
                        if (IsTokenAt(tokenStart, TokenCodeEnd))
                        {
                            _state = ParseState.Text;
                            SkipRestOfToken(TokenCodeEnd);
                            _currentfragment._length = _index - _currentfragment._index;
                            _currentfragment = CreateFragment(MixedCodeDocumentFragmentType.Text);
                            // The text fragment starts at the next unread character. Fragments
                            // record the column counter as it stands after their first character,
                            // hence the + 1 for a character that has not been consumed yet.
                            SetPosition(_index, _lineposition + 1);
                        }
                        break;
                }
            }

            _currentfragment._length = _index - _currentfragment._index;
        }
    }
}