// HtmlAgilityPack V1.0 - Simon Mourier
using System;
using System.Collections;
using System.IO;
using System.Xml;
using System.Xml.XPath;

namespace HtmlAgilityPack
{
    /// <summary>
    /// Flags that describe the behavior of an Element node.
    /// The values are distinct bits and are combined with the bitwise OR operator
    /// in the <c>ElementsFlags</c> table, hence the <see cref="FlagsAttribute"/>.
    /// </summary>
    [Flags]
    public enum HtmlElementFlag
    {
        /// <summary>
        /// The node is a CDATA node.
        /// </summary>
        CData = 1,

        /// <summary>
        /// The node is empty. META or IMG are example of such nodes.
        /// </summary>
        Empty = 2,

        /// <summary>
        /// The node will automatically be closed during parsing.
        /// </summary>
        Closed = 4,

        /// <summary>
        /// The node can overlap.
        /// </summary>
        CanOverlap = 8
    }

    /// <summary>
    /// Represents the type of a node.
    /// </summary>
    public enum HtmlNodeType
    {
        /// <summary>
        /// The root of a document.
        /// </summary>
        Document,

        /// <summary>
        /// An HTML element.
        /// </summary>
        Element,

        /// <summary>
        /// An HTML comment.
        /// </summary>
        Comment,

        /// <summary>
        /// A text node is always the child of an element or a document node.
        /// </summary>
        Text,
    }

    /// <summary>
    /// Represents an HTML node.
    /// </summary>
    public class HtmlNode: IXPathNavigable
    {
        /// <summary>
        /// Gets the name of a comment node. It is actually defined as '#comment'.
        /// </summary>
        public static readonly string HtmlNodeTypeNameComment = "#comment";

        /// <summary>
        /// Gets the name of the document node. It is actually defined as '#document'.
        /// </summary>
        public static readonly string HtmlNodeTypeNameDocument = "#document";

        /// <summary>
        /// Gets the name of a text node. It is actually defined as '#text'.
        /// </summary>
        public static readonly string HtmlNodeTypeNameText = "#text";

        ///
        /// Gets a collection of flags that define specific behaviors for specific element nodes.
        /// The table contains a DictionaryEntry list with the lowercase tag name as the Key, and a combination of HtmlElementFlags as the Value.
///
public static Hashtable ElementsFlags;

// node kind and tree links
internal HtmlNodeType _nodetype;
internal HtmlNode _nextnode;
internal HtmlNode _prevnode;
internal HtmlNode _parentnode;
internal HtmlDocument _ownerdocument;
internal HtmlNodeCollection _childnodes;
internal HtmlAttributeCollection _attributes;

// position information
internal int _line = 0;
internal int _lineposition = 0;
internal int _streamposition = 0;

// offsets into the owner document's text for the inner part, the outer part and the name
internal int _innerstartindex = 0;
internal int _innerlength = 0;
internal int _outerstartindex = 0;
internal int _outerlength = 0;
internal int _namestartindex = 0;
internal int _namelength = 0;

internal bool _starttag = false;
internal string _name;
internal HtmlNode _prevwithsamename = null;

// the node that closes this one; a node counts as closed once this is non-null
internal HtmlNode _endnode;

// when set, the cached html below must be regenerated from the tree
internal bool _innerchanged = false;
internal bool _outerchanged = false;
internal string _innerhtml;
internal string _outerhtml;

/// <summary>
/// Fills <see cref="ElementsFlags"/> with the default per-tag behaviors.
/// </summary>
static HtmlNode()
{
    // tags whose content may be anything
    ElementsFlags = new Hashtable();
    ElementsFlags.Add("script", HtmlElementFlag.CData);
    ElementsFlags.Add("style", HtmlElementFlag.CData);
    ElementsFlags.Add("noxhtml", HtmlElementFlag.CData);

    // tags that can not contain other tags
    ElementsFlags.Add("base", HtmlElementFlag.Empty);
    ElementsFlags.Add("link", HtmlElementFlag.Empty);
    ElementsFlags.Add("meta", HtmlElementFlag.Empty);
    ElementsFlags.Add("isindex", HtmlElementFlag.Empty);
    ElementsFlags.Add("hr", HtmlElementFlag.Empty);
    ElementsFlags.Add("col", HtmlElementFlag.Empty);
    ElementsFlags.Add("img", HtmlElementFlag.Empty);
    ElementsFlags.Add("param", HtmlElementFlag.Empty);
    ElementsFlags.Add("embed", HtmlElementFlag.Empty);
    ElementsFlags.Add("frame", HtmlElementFlag.Empty);
    ElementsFlags.Add("wbr", HtmlElementFlag.Empty);
    ElementsFlags.Add("bgsound", HtmlElementFlag.Empty);
    ElementsFlags.Add("spacer", HtmlElementFlag.Empty);
    ElementsFlags.Add("keygen", HtmlElementFlag.Empty);
    ElementsFlags.Add("area", HtmlElementFlag.Empty);
    ElementsFlags.Add("input", HtmlElementFlag.Empty);
    ElementsFlags.Add("basefont", HtmlElementFlag.Empty);

    ElementsFlags.Add("form", HtmlElementFlag.CanOverlap | HtmlElementFlag.Empty);

    // they sometimes contain, and sometimes they don 't...
    ElementsFlags.Add("option", HtmlElementFlag.Empty);

    // tags whose closing tag is equivalent to an open tag.
    // NOTE(review): the markup examples that illustrated this rule were garbled
    // in this comment; presumably an unclosed <p> or <br> is kept as written
    // instead of being auto-closed or re-nested - confirm against the parser.
    ElementsFlags.Add("br", HtmlElementFlag.Empty | HtmlElementFlag.Closed);
    ElementsFlags.Add("p", HtmlElementFlag.Empty | HtmlElementFlag.Closed);
}

/// <summary>
/// Determines if an element node is closed.
/// </summary>
/// <param name="name">The name of the element node to check. May not be null.</param>
/// <returns>true if the name is the name of a closed element node, false otherwise.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public static bool IsClosedElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // table keys are lowercase tag names: lower with the invariant culture so the
    // lookup does not depend on the current thread culture (e.g. Turkish 'I')
    object flag = ElementsFlags[name.ToLower(System.Globalization.CultureInfo.InvariantCulture)];
    if (flag == null)
    {
        return false;
    }
    return (((HtmlElementFlag)flag) & HtmlElementFlag.Closed) != 0;
}

/// <summary>
/// Determines if an element node can be kept overlapped.
/// </summary>
/// <param name="name">The name of the element node to check. May not be null.</param>
/// <returns>true if the name is the name of an element node that can be kept overlapped, false otherwise.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public static bool CanOverlapElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    object flag = ElementsFlags[name.ToLower(System.Globalization.CultureInfo.InvariantCulture)];
    if (flag == null)
    {
        return false;
    }
    return (((HtmlElementFlag)flag) & HtmlElementFlag.CanOverlap) != 0;
}

/// <summary>
/// Determines if a text corresponds to the closing tag of an node that can be kept overlapped.
/// </summary>
/// <param name="text">The text to check. May not be null.</param>
/// <returns>true or false.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public static bool IsOverlappedClosingElement(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    // the shortest closing tag is '<', '/', a one-character name and '>': 4 characters.
    // Exactly 4 characters must be accepted, otherwise one-letter tags can never match.
    if (text.Length < 4)
    {
        return false;
    }

    if ((text[0] != '<') || (text[text.Length - 1] != '>') || (text[1] != '/'))
    {
        return false;
    }

    // drop the two leading characters and the trailing '>'
    string name = text.Substring(2, text.Length - 3);
    return CanOverlapElement(name);
}

/// <summary>
/// Determines if an element node is a CDATA element node.
/// </summary>
/// <param name="name">The name of the element node to check. May not be null.</param>
/// <returns>true if the name is the name of a CDATA element node, false otherwise.</returns>
public static bool IsCDataElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // table keys are lowercase tag names: lower with the invariant culture so the
    // lookup does not depend on the current thread culture
    object flag = ElementsFlags[name.ToLower(System.Globalization.CultureInfo.InvariantCulture)];
    if (flag == null)
    {
        return false;
    }
    return (((HtmlElementFlag)flag) & HtmlElementFlag.CData) != 0;
}

/// <summary>
/// Determines if an element node is defined as empty.
/// </summary>
/// <param name="name">The name of the element node to check. May not be null.</param>
/// <returns>true if the name is the name of an empty element node, false otherwise.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public static bool IsEmptyElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    if (name.Length == 0)
    {
        return true;
    }

    // NOTE(review): everything from here to the end of the method had been lost
    // from the file and is reconstructed from the sibling Is*Element methods and
    // from WriteTo, which expects names starting with '?' to be reported as empty.
    // Confirm the '!' case (declarations such as DOCTYPE) against the parser.
    if (name[0] == '!')
    {
        return true;
    }

    // processing instructions such as the xml declaration
    if (name[0] == '?')
    {
        return true;
    }

    object flag = ElementsFlags[name.ToLower(System.Globalization.CultureInfo.InvariantCulture)];
    if (flag == null)
    {
        return false;
    }
    return (((HtmlElementFlag)flag) & HtmlElementFlag.Empty) != 0;
}

/// <summary>
/// Creates an HTML node from a string representing literal HTML.
/// </summary>
/// <param name="html">The HTML text.</param>
/// <returns>The newly created node instance: the first child of the document parsed from <paramref name="html"/>, or null if it has none.</returns>
public static HtmlNode CreateNode(string html)
{
    // REVIEW: this is *not* optimum...
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);
    return doc.DocumentNode.FirstChild;
}

/// <summary>
/// Creates a duplicate of the node and the subtree under it.
/// </summary>
/// <param name="node">The node to duplicate. May not be null.</param>
public void CopyFrom(HtmlNode node)
{
    CopyFrom(node, true);
}

/// <summary>
/// Creates a duplicate of the node.
/// </summary>
/// <param name="node">The node to duplicate. May not be null.</param>
/// <param name="deep">true to recursively clone the subtree under the specified node, false to clone only the node itself.</param>
public void CopyFrom(HtmlNode node, bool deep)
{
    if (node == null)
    {
        throw new ArgumentNullException("node");
    }

    // replace our attributes with copies of the source's
    Attributes.RemoveAll();
    if (node.HasAttributes)
    {
        foreach(HtmlAttribute att in node.Attributes)
        {
            SetAttributeValue(att.Name, att.Value);
        }
    }

    // the subtree is only copied for a deep copy (the test used to be inverted,
    // so CopyFrom(node) silently skipped the children it is documented to copy)
    if (deep)
    {
        RemoveAllChildren();
        if (node.HasChildNodes)
        {
            foreach(HtmlNode child in node.ChildNodes)
            {
                AppendChild(child.CloneNode(true));
            }
        }
    }
}

/// <summary>
/// Initializes a node of the given type for the given document.
/// </summary>
/// <param name="type">The type of the node.</param>
/// <param name="ownerdocument">The document the node belongs to.</param>
/// <param name="index">The start index of the node in the document text, or -1 when the node does not come from parsed text.</param>
internal HtmlNode(HtmlNodeType type, HtmlDocument ownerdocument, int index)
{
    _nodetype = type;
    _ownerdocument = ownerdocument;
    _outerstartindex = index;

    // comment, document and text nodes have a fixed name and are their own end node
    switch(type)
    {
        case HtmlNodeType.Comment:
            _name = HtmlNodeTypeNameComment;
            _endnode = this;
            break;

        case HtmlNodeType.Document:
            _name = HtmlNodeTypeNameDocument;
            _endnode = this;
            break;

        case HtmlNodeType.Text:
            _name = HtmlNodeTypeNameText;
            _endnode = this;
            break;
    }

    if (_ownerdocument._openednodes != null)
    {
        if (!Closed)
        {
            // we use the index as the key
            // -1 means the node comes from public
            if (-1 != index)
            {
                _ownerdocument._openednodes.Add(index, this);
            }
        }
    }

    if ((-1 == index) && (type != HtmlNodeType.Comment) && (type != HtmlNodeType.Text))
    {
        // innerhtml and outerhtml must be calculated
        _outerchanged = true;
        _innerchanged = true;
    }
}

/// <summary>
/// Closes this node with the given end node and updates the inner/outer text ranges.
/// </summary>
/// <param name="endnode">The node that closes this one. May be this node itself.</param>
internal void CloseNode(HtmlNode endnode)
{
    if (!_ownerdocument.OptionAutoCloseOnEnd)
    {
        // close all children
        if (_childnodes != null)
        {
            foreach(HtmlNode child in _childnodes)
            {
                if (child.Closed)
                    continue;

                // create a fake closer node
                HtmlNode close = new HtmlNode(NodeType, _ownerdocument, -1);
                close._endnode = close;
                child.CloseNode(close);
            }
        }
    }

    if (!Closed)
    {
        _endnode = endnode;

        if (_ownerdocument._openednodes != null)
        {
            _ownerdocument._openednodes.Remove(_outerstartindex);
        }

        HtmlNode self = _ownerdocument._lastnodes[Name] as HtmlNode;
        if (self == this)
        {
            _ownerdocument._lastnodes.Remove(Name);
            _ownerdocument.UpdateLastParentNode();
        }

        if (endnode == this)
            return;

        // create an inner section
        _innerstartindex = _outerstartindex + _outerlength;
        _innerlength = endnode._outerstartindex - _innerstartindex;

        // update full length
        _outerlength = (endnode._outerstartindex + endnode._outerlength) - _outerstartindex;
    }
}

/// <summary>
/// Gets the node that closes this node, or null if the node is not closed.
/// </summary>
internal HtmlNode EndNode
{
    get
    {
        return _endnode;
    }
}

/// <summary>
/// Gets the value of the 'id' attribute.
/// </summary>
/// <returns>The attribute value, or null if the node has no 'id' attribute.</returns>
internal string GetId()
{
    HtmlAttribute att = Attributes["id"];
    if (att == null)
    {
        return null;
    }
    return att.Value;
}

/// <summary>
/// Sets the value of the 'id' attribute and registers the node under that id in the owner document.
/// </summary>
/// <param name="id">The new id.</param>
internal void SetId(string id)
{
    // SetAttributeValue updates an existing attribute or appends a new one, so a
    // freshly created 'id' attribute really ends up attached to this node
    // (it used to be created and then dropped, leaving GetId() returning null)
    HtmlAttribute att = SetAttributeValue("id", id);
    _ownerdocument.SetIdForNode(this, att.Value);
    _outerchanged = true;
}

/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML node.
/// </summary>
/// <returns>An XPathNavigator object. The XPathNavigator is positioned on the node from which the method was called. It is not positioned on the root of the document.</returns>
public XPathNavigator CreateNavigator()
{
    return new HtmlNodeNavigator(_ownerdocument, this);
}

/// <summary>
/// Selects the first XmlNode that matches the XPath expression.
/// </summary>
/// <param name="xpath">The XPath expression. May not be null.</param>
/// <returns>The first HtmlNode that matches the XPath query or a null reference if no matching node was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="xpath"/> is null.</exception>
public HtmlNode SelectSingleNode(string xpath)
{
    if (xpath == null)
    {
        throw new ArgumentNullException("xpath");
    }

    HtmlNodeNavigator nav = new HtmlNodeNavigator(_ownerdocument, this);
    XPathNodeIterator it = nav.Select(xpath);
    if (!it.MoveNext())
    {
        return null;
    }

    HtmlNodeNavigator node = (HtmlNodeNavigator)it.Current;
    return node.CurrentNode;
}

/// <summary>
/// Selects a list of nodes matching the XPath expression.
/// </summary>
/// <param name="xpath">The XPath expression.</param>
/// <returns>An HtmlNodeCollection containing a collection of nodes matching the XPath query, or null if no node matched the XPath expression.</returns>
public HtmlNodeCollection SelectNodes(string xpath)
{
    // same contract as SelectSingleNode: fail early with a clear argument error
    if (xpath == null)
    {
        throw new ArgumentNullException("xpath");
    }

    HtmlNodeCollection list = new HtmlNodeCollection(null);

    HtmlNodeNavigator nav = new HtmlNodeNavigator(_ownerdocument, this);
    XPathNodeIterator it = nav.Select(xpath);
    while (it.MoveNext())
    {
        HtmlNodeNavigator n = (HtmlNodeNavigator)it.Current;
        list.Add(n.CurrentNode);
    }

    if (list.Count == 0)
    {
        return null;
    }
    return list;
}

/// <summary>
/// Gets or sets the value of the 'id' HTML attribute. The document must have been parsed using the OptionUseIdAttribute set to true.
/// </summary>
/// <exception cref="Exception">The owner document keeps no id table.</exception>
/// <exception cref="ArgumentNullException">The value being set is null.</exception>
public string Id
{
    get
    {
        if (_ownerdocument._nodesid == null)
        {
            throw new Exception(HtmlDocument.HtmlExceptionUseIdAttributeFalse);
        }
        return GetId();
    }
    set
    {
        if (_ownerdocument._nodesid == null)
        {
            throw new Exception(HtmlDocument.HtmlExceptionUseIdAttributeFalse);
        }

        if (value == null)
        {
            throw new ArgumentNullException("value");
        }
        SetId(value);
    }
}

/// <summary>
/// Gets the line number of this node in the document.
/// </summary>
public int Line
{
    get
    {
        return _line;
    }
}

/// <summary>
/// Gets the column number of this node in the document.
/// </summary>
public int LinePosition
{
    get
    {
        return _lineposition;
    }
}

/// <summary>
/// Gets the stream position of this node in the document, relative to the start of the document.
/// </summary>
public int StreamPosition
{
    get
    {
        return _streamposition;
    }
}

/// <summary>
/// Gets a value indicating if this node has been closed or not.
/// </summary>
public bool Closed
{
    get
    {
        return (_endnode != null);
    }
}

/// <summary>
/// Gets or sets this node's name. When no name has been assigned yet it is read
/// lazily from the owner document's text and lowercased.
/// </summary>
public string Name
{
    get
    {
        if (_name == null)
        {
            // invariant lowercasing: tag names must not depend on the thread culture
            _name = _ownerdocument._text.Substring(_namestartindex, _namelength)
                .ToLower(System.Globalization.CultureInfo.InvariantCulture);
        }
        return _name;
    }
    set
    {
        _name = value;
    }
}

/// <summary>
/// Gets or Sets the text between the start and end tags of the object.
/// </summary>
public virtual string InnerText
{
    get
    {
        if (_nodetype == HtmlNodeType.Text)
        {
            return ((HtmlTextNode)this).Text;
        }

        if (_nodetype == HtmlNodeType.Comment)
        {
            return ((HtmlCommentNode)this).Comment;
        }

        // note: right now, this method is *slow*, because we recompute everything.
        // it could be optimised like innerhtml
        if (!HasChildNodes)
        {
            return string.Empty;
        }

        // a builder avoids re-copying the accumulated text for every child;
        // appending a null string adds nothing, as string concatenation did
        System.Text.StringBuilder text = new System.Text.StringBuilder();
        foreach(HtmlNode node in ChildNodes)
        {
            text.Append(node.InnerText);
        }
        return text.ToString();
    }
}

/// <summary>
/// Gets or Sets the HTML between the start and end tags of the object.
/// </summary>
public virtual string InnerHtml
{
    get
    {
        // the tree changed: regenerate and cache
        if (_innerchanged)
        {
            _innerhtml = WriteContentTo();
            _innerchanged = false;
            return _innerhtml;
        }

        if (_innerhtml != null)
        {
            return _innerhtml;
        }

        if (_innerstartindex < 0)
        {
            return string.Empty;
        }

        // untouched parsed node: slice the original document text
        return _ownerdocument._text.Substring(_innerstartindex, _innerlength);
    }
    set
    {
        // parse the fragment in a scratch document and adopt its top-level nodes
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(value);

        RemoveAllChildren();
        AppendChildren(doc.DocumentNode.ChildNodes);
    }
}

/// <summary>
/// Gets or Sets the object and its content in HTML.
/// </summary>
public virtual string OuterHtml
{
    get
    {
        // the tree changed: regenerate and cache
        if (_outerchanged)
        {
            _outerhtml = WriteTo();
            _outerchanged = false;
            return _outerhtml;
        }

        if (_outerhtml != null)
        {
            return _outerhtml;
        }

        if (_outerstartindex < 0)
        {
            return string.Empty;
        }

        // untouched parsed node: slice the original document text
        return _ownerdocument._text.Substring(_outerstartindex, _outerlength);
    }
}

/// <summary>
/// Creates a duplicate of the node
/// </summary>
/// <returns>A deep clone of this node.</returns>
public HtmlNode Clone()
{
    return CloneNode(true);
}

/// <summary>
/// Creates a duplicate of the node and changes its name at the same time.
/// </summary>
/// <param name="newName">The new name of the cloned node. May not be null.</param>
/// <returns>The cloned node.</returns>
public HtmlNode CloneNode(string newName)
{
    return CloneNode(newName, true);
}

/// <summary>
/// Creates a duplicate of the node and changes its name at the same time.
/// </summary>
/// <param name="newName">The new name of the cloned node. May not be null.</param>
/// <param name="deep">true to recursively clone the subtree under the specified node; false to clone only the node itself.</param>
/// <returns>The cloned node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="newName"/> is null.</exception>
public HtmlNode CloneNode(string newName, bool deep)
{
    if (newName == null)
    {
        throw new ArgumentNullException("newName");
    }

    HtmlNode node = CloneNode(deep);
    node._name = newName;
    return node;
}

///
/// Creates a duplicate of the node.
///
/// <param name="deep">true to recursively clone the subtree under the specified node; false to clone only the node itself.</param>
/// <returns>The cloned node.</returns>
public HtmlNode CloneNode(bool deep)
{
    HtmlNode copy = _ownerdocument.CreateNode(_nodetype);
    copy._name = Name;

    // comment and text nodes only carry their payload
    if (_nodetype == HtmlNodeType.Comment)
    {
        ((HtmlCommentNode)copy).Comment = ((HtmlCommentNode)this).Comment;
        return copy;
    }
    if (_nodetype == HtmlNodeType.Text)
    {
        ((HtmlTextNode)copy).Text = ((HtmlTextNode)this).Text;
        return copy;
    }

    // attributes of the opening tag
    if (HasAttributes)
    {
        foreach(HtmlAttribute source in _attributes)
        {
            copy.Attributes.Append(source.Clone());
        }
    }

    // attributes of the closing tag live on a cloned end node
    if (HasClosingAttributes)
    {
        copy._endnode = _endnode.CloneNode(false);
        foreach(HtmlAttribute source in _endnode._attributes)
        {
            copy._endnode._attributes.Append(source.Clone());
        }
    }

    // child nodes, for a deep clone only
    if (deep && HasChildNodes)
    {
        foreach(HtmlNode child in _childnodes)
        {
            copy.AppendChild(child.Clone());
        }
    }
    return copy;
}

/// <summary>
/// Gets the HTML node immediately following this element.
/// </summary>
public HtmlNode NextSibling
{
    get { return _nextnode; }
}

/// <summary>
/// Gets the node immediately preceding this node.
/// </summary>
public HtmlNode PreviousSibling
{
    get { return _prevnode; }
}

/// <summary>
/// Removes all the children and/or attributes of the current node.
/// Attributes of a separate end node are cleared as well.
/// </summary>
public void RemoveAll()
{
    RemoveAllChildren();

    if (HasAttributes)
    {
        _attributes.Clear();
    }

    bool hasSeparateEndNode = (_endnode != null) && (_endnode != this);
    if (hasSeparateEndNode && (_endnode._attributes != null))
    {
        _endnode._attributes.Clear();
    }

    _innerchanged = true;
    _outerchanged = true;
}

///
/// Removes all the children of the current node.
///
public void RemoveAllChildren()
{
    if (!HasChildNodes)
    {
        return;
    }

    if (_ownerdocument.OptionUseIdAttribute)
    {
        // remove nodes from id list
        foreach(HtmlNode node in _childnodes)
        {
            _ownerdocument.SetIdForNode(null, node.GetId());
        }
    }
    _childnodes.Clear();

    _outerchanged = true;
    _innerchanged = true;
}

/// <summary>
/// Removes the specified child node.
/// </summary>
/// <param name="oldChild">The node being removed. May not be null.</param>
/// <returns>The node removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="oldChild"/> is null.</exception>
/// <exception cref="ArgumentException">This node has no child collection.</exception>
public HtmlNode RemoveChild(HtmlNode oldChild)
{
    if (oldChild == null)
    {
        throw new ArgumentNullException("oldChild");
    }

    int index = -1;
    if (_childnodes != null)
    {
        // the collection indexer itself throws when the node is not one of its items
        index = _childnodes[oldChild];
    }

    if (index == -1)
    {
        throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
    }

    _childnodes.Remove(index);

    _ownerdocument.SetIdForNode(null, oldChild.GetId());

    _outerchanged = true;
    _innerchanged = true;
    return oldChild;
}

/// <summary>
/// Removes the specified child node.
/// </summary>
/// <param name="oldChild">The node being removed. May not be null.</param>
/// <param name="keepGrandChildren">true to keep grand children of the node, false otherwise.</param>
/// <returns>The node removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="oldChild"/> is null.</exception>
public HtmlNode RemoveChild(HtmlNode oldChild, bool keepGrandChildren)
{
    if (oldChild == null)
    {
        throw new ArgumentNullException("oldChild");
    }

    if ((oldChild._childnodes != null) && keepGrandChildren)
    {
        // get prev sibling
        HtmlNode prev = oldChild.PreviousSibling;

        // reroute grand children to ourselves; each one goes after the previously
        // inserted one so that they keep their original order (inserting them all
        // after the same sibling reversed them)
        foreach(HtmlNode grandchild in oldChild._childnodes)
        {
            prev = InsertAfter(grandchild, prev);
        }
    }

    RemoveChild(oldChild);

    _outerchanged = true;
    _innerchanged = true;
    return oldChild;
}

/// <summary>
/// Replaces the child node oldChild with newChild node.
/// </summary>
/// <param name="newChild">The new node to put in the child list.</param>
/// <param name="oldChild">The node being replaced in the list.</param>
/// <returns>The node replaced.</returns>
public HtmlNode ReplaceChild(HtmlNode newChild, HtmlNode oldChild)
{
    // no replacement means removal; no target means append
    if (newChild == null)
    {
        return RemoveChild(oldChild);
    }
    if (oldChild == null)
    {
        return AppendChild(newChild);
    }

    int position = (_childnodes == null) ? -1 : _childnodes[oldChild];
    if (position == -1)
    {
        throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
    }

    _childnodes.Replace(position, newChild);

    // keep the document id table in step: forget the old node, register the new one
    _ownerdocument.SetIdForNode(null, oldChild.GetId());
    _ownerdocument.SetIdForNode(newChild, newChild.GetId());

    _innerchanged = true;
    _outerchanged = true;
    return newChild;
}

/// <summary>
/// Inserts the specified node immediately before the specified reference node.
/// </summary>
/// <param name="newChild">The node to insert. May not be null.</param>
/// <param name="refChild">The node that is the reference node. The newChild is placed before this node. When null, newChild is appended.</param>
/// <returns>The node being inserted.</returns>
/// <exception cref="ArgumentNullException"><paramref name="newChild"/> is null.</exception>
public HtmlNode InsertBefore(HtmlNode newChild, HtmlNode refChild)
{
    if (newChild == null)
    {
        throw new ArgumentNullException("newChild");
    }
    if (refChild == null)
    {
        return AppendChild(newChild);
    }
    if (newChild == refChild)
    {
        // nothing to move
        return newChild;
    }

    int position = (_childnodes == null) ? -1 : _childnodes[refChild];
    if (position == -1)
    {
        throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
    }

    _childnodes.Insert(position, newChild);
    _ownerdocument.SetIdForNode(newChild, newChild.GetId());

    _innerchanged = true;
    _outerchanged = true;
    return newChild;
}

/// <summary>
/// Inserts the specified node immediately after the specified reference node.
/// </summary>
/// <param name="newChild">The node to insert. May not be null.</param>
/// <param name="refChild">The node that is the reference node. The newNode is placed after the refNode.</param>
/// <returns>The node being inserted.</returns>
public HtmlNode InsertAfter(HtmlNode newChild, HtmlNode refChild)
{
    if (newChild == null)
    {
        throw new ArgumentNullException("newChild");
    }
    if (refChild == null)
    {
        // no reference node: the new child becomes the first child
        return PrependChild(newChild);
    }
    if (newChild == refChild)
    {
        return newChild;
    }

    int position = (_childnodes == null) ? -1 : _childnodes[refChild];
    if (position == -1)
    {
        throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
    }

    _childnodes.Insert(position + 1, newChild);
    _ownerdocument.SetIdForNode(newChild, newChild.GetId());

    _innerchanged = true;
    _outerchanged = true;
    return newChild;
}

/// <summary>
/// Gets the first child of the node, or null if there is none.
/// </summary>
public HtmlNode FirstChild
{
    get { return HasChildNodes ? _childnodes[0] : null; }
}

/// <summary>
/// Gets the last child of the node, or null if there is none.
/// </summary>
public HtmlNode LastChild
{
    get { return HasChildNodes ? _childnodes[_childnodes.Count-1] : null; }
}

/// <summary>
/// Gets the type of this node.
/// </summary>
public HtmlNodeType NodeType
{
    get { return _nodetype; }
}

/// <summary>
/// Gets the parent of this node (for nodes that can have parents).
/// </summary>
public HtmlNode ParentNode
{
    get { return _parentnode; }
}

/// <summary>
/// Gets the HtmlDocument to which this node belongs.
/// </summary>
public HtmlDocument OwnerDocument
{
    get { return _ownerdocument; }
}

/// <summary>
/// Gets all the children of the node. The collection is created on first access.
/// </summary>
public HtmlNodeCollection ChildNodes
{
    get
    {
        if (_childnodes != null)
        {
            return _childnodes;
        }
        _childnodes = new HtmlNodeCollection(this);
        return _childnodes;
    }
}

/// <summary>
/// Adds the specified node to the beginning of the list of children of this node.
/// </summary>
/// <param name="newChild">The node to add. May not be null.</param>
/// <returns>The node added.</returns>
/// <exception cref="ArgumentNullException"><paramref name="newChild"/> is null.</exception>
public HtmlNode PrependChild(HtmlNode newChild)
{
    if (newChild == null)
    {
        throw new ArgumentNullException("newChild");
    }

    ChildNodes.Prepend(newChild);
    _ownerdocument.SetIdForNode(newChild, newChild.GetId());

    _innerchanged = true;
    _outerchanged = true;
    return newChild;
}

/// <summary>
/// Adds the specified node list to the beginning of the list of children of this node.
/// </summary>
/// <param name="newChildren">The node list to add. May not be null.</param>
public void PrependChildren(HtmlNodeCollection newChildren)
{
    if (newChildren == null)
    {
        throw new ArgumentNullException("newChildren");
    }

    // prepend from last to first so the list keeps its order at the head of the
    // children (prepending first to last left the new nodes reversed)
    for (int i = newChildren.Count - 1; i >= 0; i--)
    {
        PrependChild(newChildren[i]);
    }
}

/// <summary>
/// Adds the specified node to the end of the list of children of this node.
/// </summary>
/// <param name="newChild">The node to add. May not be null.</param>
/// <returns>The node added.</returns>
/// <exception cref="ArgumentNullException"><paramref name="newChild"/> is null.</exception>
public HtmlNode AppendChild(HtmlNode newChild)
{
    if (newChild == null)
    {
        throw new ArgumentNullException("newChild");
    }

    ChildNodes.Append(newChild);
    _ownerdocument.SetIdForNode(newChild, newChild.GetId());
    _outerchanged = true;
    _innerchanged = true;
    return newChild;
}

/// <summary>
/// Adds the specified node to the end of the list of children of this node.
/// </summary>
/// <param name="newChildren">The node list to add. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="newChildren"/> is null.</exception>
public void AppendChildren(HtmlNodeCollection newChildren)
{
    if (newChildren == null)
    {
        // report the real parameter name (it used to be misspelled)
        throw new ArgumentNullException("newChildren");
    }

    foreach(HtmlNode newChild in newChildren)
    {
        AppendChild(newChild);
    }
}

/// <summary>
/// Gets a value indicating whether the current node has any attributes.
/// </summary>
public bool HasAttributes
{
    get
    {
        if (_attributes == null)
        {
            return false;
        }

        if (_attributes.Count <= 0)
        {
            return false;
        }
        return true;
    }
}

/// <summary>
/// Gets a value indicating whether the current node has any attributes on the closing tag.
/// </summary>
public bool HasClosingAttributes
{
    get
    {
        // an unclosed or self-closed node has no separate closing tag
        if ((_endnode == null) || (_endnode == this))
        {
            return false;
        }

        if (_endnode._attributes == null)
        {
            return false;
        }

        if (_endnode._attributes.Count <= 0)
        {
            return false;
        }
        return true;
    }
}

/// <summary>
/// Gets a value indicating whether this node has any child nodes.
/// </summary>
public bool HasChildNodes
{
    get
    {
        if (_childnodes == null)
        {
            return false;
        }

        if (_childnodes.Count <= 0)
        {
            return false;
        }
        return true;
    }
}

/// <summary>
/// Helper method to get the value of an attribute of this node. If the attribute is not found, the default value will be returned.
/// </summary>
/// <param name="name">The name of the attribute to get. May not be null.</param>
/// <param name="def">The default value to return if not found.</param>
/// <returns>The value of the attribute if found, the default value if not found.</returns>
public string GetAttributeValue(string name, string def)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // only touch the Attributes collection when the node actually has attributes
    HtmlAttribute found = HasAttributes ? Attributes[name] : null;
    return (found == null) ? def : found.Value;
}

/// <summary>
/// Helper method to get the value of an attribute of this node, converted to an integer.
/// If the attribute is not found or cannot be converted, the default value will be returned.
/// </summary>
/// <param name="name">The name of the attribute to get. May not be null.</param>
/// <param name="def">The default value to return if not found.</param>
/// <returns>The value of the attribute if found, the default value if not found.</returns>
public int GetAttributeValue(string name, int def)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute found = HasAttributes ? Attributes[name] : null;
    if (found == null)
    {
        return def;
    }

    int converted;
    try
    {
        converted = Convert.ToInt32(found.Value);
    }
    catch
    {
        // best effort: any conversion failure falls back to the default
        converted = def;
    }
    return converted;
}

/// <summary>
/// Helper method to get the value of an attribute of this node, converted to a boolean.
/// If the attribute is not found or cannot be converted, the default value will be returned.
/// </summary>
/// <param name="name">The name of the attribute to get. May not be null.</param>
/// <param name="def">The default value to return if not found.</param>
/// <returns>The value of the attribute if found, the default value if not found.</returns>
public bool GetAttributeValue(string name, bool def)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute found = HasAttributes ? Attributes[name] : null;
    if (found == null)
    {
        return def;
    }

    bool converted;
    try
    {
        converted = Convert.ToBoolean(found.Value);
    }
    catch
    {
        // best effort: any conversion failure falls back to the default
        converted = def;
    }
    return converted;
}

/// <summary>
/// Helper method to set the value of an attribute of this node. If the attribute is not found, it will be created automatically.
/// </summary>
/// <param name="name">The name of the attribute to set. May not be null.</param>
/// <param name="value">The value for the attribute.</param>
/// <returns>The corresponding attribute instance.</returns>
public HtmlAttribute SetAttributeValue(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute att = Attributes[name];
    if (att == null)
    {
        return Attributes.Append(_ownerdocument.CreateAttribute(name, value));
    }
    att.Value = value;
    return att;
}

/// <summary>
/// Gets the collection of HTML attributes for this node. May not be null.
/// </summary>
public HtmlAttributeCollection Attributes
{
    get
    {
        // create the collection once and keep it: replacing an existing but empty
        // collection on every access left callers holding a stale instance
        if (_attributes == null)
        {
            _attributes = new HtmlAttributeCollection(this);
        }
        return _attributes;
    }
}

/// <summary>
/// Gets the collection of HTML attributes for the closing tag. May not be null.
/// When the closing tag has no attributes a new, empty collection is returned.
/// </summary>
public HtmlAttributeCollection ClosingAttributes
{
    get
    {
        if (!HasClosingAttributes)
        {
            return new HtmlAttributeCollection(this);
        }
        return _endnode.Attributes;
    }
}

/// <summary>
/// Writes a single attribute, preceded by a space, honoring the owner document's output options.
/// </summary>
/// <param name="outText">The writer to write to.</param>
/// <param name="att">The attribute to write.</param>
internal void WriteAttribute(TextWriter outText, HtmlAttribute att)
{
    string name;

    if (_ownerdocument.OptionOutputAsXml)
    {
        if (_ownerdocument.OptionOutputUpperCase)
        {
            name = att.XmlName.ToUpper();
        }
        else
        {
            name = att.XmlName;
        }

        outText.Write(" " + name + "=\"" + HtmlDocument.HtmlEncode(att.XmlValue) + "\"");
    }
    else
    {
        if (_ownerdocument.OptionOutputUpperCase)
        {
            name = att.Name.ToUpper();
        }
        else
        {
            name = att.Name;
        }

        // a name of the form <% ... %> is written as is, without any value
        if (att.Name.Length >= 4)
        {
            if ((att.Name[0] == '<') && (att.Name[1] == '%') &&
                (att.Name[att.Name.Length-1] == '>') && (att.Name[att.Name.Length-2] == '%'))
            {
                outText.Write(" " + name);
                return;
            }
        }

        if (_ownerdocument.OptionOutputOptimizeAttributeValues)
        {
            // quotes are dropped only when the value has no LF, CR, TAB or space
            if (att.Value.IndexOfAny(new Char[]{(char)10, (char)13, (char)9, ' '}) < 0)
            {
                outText.Write(" " + name + "=" + att.Value);
            }
            else
            {
                outText.Write(" " + name + "=\"" + att.Value + "\"");
            }
        }
        else
        {
            outText.Write(" " + name + "=\"" + att.Value + "\"");
        }
    }
}

/// <summary>
/// Writes the attributes of a node to an XmlWriter.
/// </summary>
/// <param name="writer">The writer to write to.</param>
/// <param name="node">The node whose attributes are written.</param>
internal static void WriteAttributes(XmlWriter writer, HtmlNode node)
{
    if (!node.HasAttributes)
    {
        return;
    }

    // we use _hashitems to make sure attributes are written only once
    foreach(HtmlAttribute att in node.Attributes._hashitems.Values)
    {
        writer.WriteAttributeString(att.XmlName, att.Value);
    }
}

/// <summary>
/// Writes the attributes of the opening tag or of the closing tag.
/// </summary>
/// <param name="outText">The writer to write to.</param>
/// <param name="closing">true to write the closing tag's attributes, false for the opening tag's.
/// Ignored when the document is output as XML, where only this node's own attributes are written.</param>
internal void WriteAttributes(TextWriter outText, bool closing)
{
    if (_ownerdocument.OptionOutputAsXml)
    {
        if (_attributes == null)
        {
            return;
        }

        // we use _hashitems to make sure attributes are written only once
        foreach(HtmlAttribute att in _attributes._hashitems.Values)
        {
            WriteAttribute(outText, att);
        }
        return;
    }

    if (!closing)
    {
        if (_attributes != null)
        {
            foreach(HtmlAttribute att in _attributes)
            {
                WriteAttribute(outText, att);
            }
        }

        if (_ownerdocument.OptionAddDebuggingAttributes)
        {
            WriteAttribute(outText, _ownerdocument.CreateAttribute("_closed", Closed.ToString()));
            WriteAttribute(outText, _ownerdocument.CreateAttribute("_children", ChildNodes.Count.ToString()));

            int i = 0;
            foreach(HtmlNode n in ChildNodes)
            {
                WriteAttribute(outText, _ownerdocument.CreateAttribute("_child_" + i, n.Name));
                i++;
            }
        }
    }
    else
    {
        if (_endnode == null)
        {
            return;
        }

        if (_endnode._attributes == null)
        {
            return;
        }

        if (_endnode == this)
        {
            return;
        }

        foreach(HtmlAttribute att in _endnode._attributes)
        {
            WriteAttribute(outText, att);
        }

        if (_ownerdocument.OptionAddDebuggingAttributes)
        {
            WriteAttribute(outText, _ownerdocument.CreateAttribute("_closed", Closed.ToString()));
            WriteAttribute(outText, _ownerdocument.CreateAttribute("_children", ChildNodes.Count.ToString()));
        }
    }
}

/// <summary>
/// Gets the text of a comment node in a form usable inside an XML comment:
/// the first 4 and last 3 characters are dropped (presumably the comment
/// delimiters) and every "--" is replaced with " - -".
/// </summary>
/// <param name="comment">The comment node.</param>
/// <returns>The converted comment text.</returns>
internal static string GetXmlComment(HtmlCommentNode comment)
{
    string s = comment.Comment;
    return s.Substring(4, s.Length-7).Replace("--", " - -");
}

/// <summary>
/// Saves the current node to the specified TextWriter.
/// </summary>
/// <param name="outText">The TextWriter to which you want to save.</param>
public void WriteTo(TextWriter outText)
{
    // NOTE(review): every string literal in this method that contained markup,
    // and the code between two such literals, had been lost from the file (empty
    // Write("") calls, unbalanced braces). Those parts are reconstructed below
    // and individually marked; confirm each against a pristine copy.
    string html;
    switch(_nodetype)
    {
        case HtmlNodeType.Comment:
            html = ((HtmlCommentNode)this).Comment;
            if (_ownerdocument.OptionOutputAsXml)
            {
                // reconstructed: GetXmlComment strips the delimiters, so they are re-added here
                outText.Write("<!--" + GetXmlComment((HtmlCommentNode)this) + " -->");
            }
            else
            {
                outText.Write(html);
            }
            break;

        case HtmlNodeType.Document:
            if (_ownerdocument.OptionOutputAsXml)
            {
                // reconstructed: same declaration as the XmlWriter overload emits
                outText.Write("<?xml version=\"1.0\" encoding=\"" + _ownerdocument.GetOutEncoding().BodyName + "\"?>");

                // check there is a root element
                if (_ownerdocument.DocumentNode.HasChildNodes)
                {
                    int rootnodes = _ownerdocument.DocumentNode._childnodes.Count;
                    if (rootnodes > 0)
                    {
                        HtmlNode xml = _ownerdocument.GetXmlDeclaration();
                        if (xml != null)
                        {
                            rootnodes --;
                        }

                        if (rootnodes > 1)
                        {
                            // several top-level nodes: wrap them in a single root element
                            // (reconstructed: the wrapper element name is presumably span)
                            if (_ownerdocument.OptionOutputUpperCase)
                            {
                                outText.Write("<SPAN>");
                                WriteContentTo(outText);
                                outText.Write("</SPAN>");
                            }
                            else
                            {
                                outText.Write("<span>");
                                WriteContentTo(outText);
                                outText.Write("</span>");
                            }
                            break;
                        }
                    }
                }
            }
            WriteContentTo(outText);
            break;

        case HtmlNodeType.Text:
            html = ((HtmlTextNode)this).Text;
            if (_ownerdocument.OptionOutputAsXml)
            {
                outText.Write(HtmlDocument.HtmlEncode(html));
            }
            else
            {
                outText.Write(html);
            }
            break;

        case HtmlNodeType.Element:
            string name;
            if (_ownerdocument.OptionOutputUpperCase)
            {
                name = Name.ToUpper();
            }
            else
            {
                name = Name;
            }

            if (_ownerdocument.OptionOutputAsXml)
            {
                if (name.Length > 0)
                {
                    if (name[0] == '?')
                    {
                        // forget this one, it's been done at the document level
                        break;
                    }

                    if (name.Trim().Length == 0)
                    {
                        break;
                    }
                    name = HtmlDocument.GetXmlName(name);
                }
                else
                {
                    break;
                }
            }

            outText.Write("<" + name);
            WriteAttributes(outText, false);

            if (!HasChildNodes)
            {
                if (HtmlNode.IsEmptyElement(Name))
                {
                    if ((_ownerdocument.OptionWriteEmptyNodes) || (_ownerdocument.OptionOutputAsXml))
                    {
                        outText.Write(" />");
                    }
                    else
                    {
                        if (Name.Length > 0)
                        {
                            if (Name[0] == '?')
                            {
                                outText.Write("?");
                            }
                        }

                        outText.Write(">");
                    }
                }
                else
                {
                    // reconstructed: a childless, non-empty element still needs its closing tag
                    outText.Write("></" + name + ">");
                }
            }
            else
            {
                outText.Write(">");
                bool cdata = false;
                if (_ownerdocument.OptionOutputAsXml)
                {
                    if (HtmlNode.IsCDataElement(Name))
                    {
                        // this code and the following tries to output things as nicely as possible for old browsers.
                        cdata = true;
                        // reconstructed: CDATA opener hidden behind a script-style comment
                        outText.Write("\r\n//<![CDATA[\r\n");
                    }
                }

                if (cdata)
                {
                    if (HasChildNodes)
                    {
                        // child must be a text
                        ChildNodes[0].WriteTo(outText);
                    }
                    // reconstructed: matching CDATA terminator
                    outText.Write("\r\n//]]>//\r\n");
                }
                else
                {
                    WriteContentTo(outText);
                }

                // reconstructed: closing tag, with its own attributes in HTML output
                outText.Write("</" + name);
                if (!_ownerdocument.OptionOutputAsXml)
                {
                    WriteAttributes(outText, true);
                }
                outText.Write(">");
            }
            break;
    }
}

/// <summary>
/// Saves the current node to the specified XmlWriter.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save.</param>
public void WriteTo(XmlWriter writer)
{
    string html;
    switch(_nodetype)
    {
        case HtmlNodeType.Comment:
            writer.WriteComment(GetXmlComment((HtmlCommentNode)this));
            break;

        case HtmlNodeType.Document:
            writer.WriteProcessingInstruction("xml", "version=\"1.0\" encoding=\"" + _ownerdocument.GetOutEncoding().BodyName + "\"");
            if (HasChildNodes)
            {
                foreach(HtmlNode subnode in ChildNodes)
                {
                    subnode.WriteTo(writer);
                }
            }
            break;

        case HtmlNodeType.Text:
            html = ((HtmlTextNode)this).Text;
            writer.WriteString(html);
            break;

        case HtmlNodeType.Element:
            string name;
            if (_ownerdocument.OptionOutputUpperCase)
            {
                name = Name.ToUpper();
            }
            else
            {
                name = Name;
            }
            writer.WriteStartElement(name);
            WriteAttributes(writer, this);

            if (HasChildNodes)
            {
                foreach(HtmlNode subnode in ChildNodes)
                {
                    subnode.WriteTo(writer);
                }
            }
            writer.WriteEndElement();
            break;
    }
}

/// <summary>
/// Saves all the children of the node to the specified TextWriter.
/// </summary>
/// <param name="outText">The TextWriter to which you want to save.</param>
public void WriteContentTo(TextWriter outText)
{
    if (_childnodes == null)
    {
        return;
    }

    foreach(HtmlNode node in _childnodes)
    {
        node.WriteTo(outText);
    }
}

/// <summary>
/// Saves the current node to a string.
/// </summary>
/// <returns>The saved string.</returns>
public string WriteTo()
{
    StringWriter sw = new StringWriter();
    WriteTo(sw);
    sw.Flush();
    return sw.ToString();
}

/// <summary>
/// Saves all the children of the node to a string.
/// </summary>
/// <returns>The saved string.</returns>
public string WriteContentTo()
{
    StringWriter sw = new StringWriter();
    WriteContentTo(sw);
    sw.Flush();
    return sw.ToString();
}
}

///
/// Represents a combined list and collection of HTML nodes.
///
public class HtmlNodeCollection: IEnumerable
{
    // Backing store; entries are read back as HtmlNode everywhere in this class.
    private ArrayList _items = new ArrayList();

    // Assigned as _parentnode to every node linked into this collection; may be null.
    private HtmlNode _parentnode;

    internal HtmlNodeCollection(HtmlNode parentnode)
    {
        _parentnode = parentnode; // may be null
    }

    /// <summary>
    /// Gets the number of elements actually contained in the list.
    /// </summary>
    public int Count
    {
        get
        {
            return _items.Count;
        }
    }

    // Detaches every node (parent and sibling links reset to null) and empties the list.
    internal void Clear()
    {
        foreach (HtmlNode node in _items)
        {
            node._parentnode = null;
            node._nextnode = null;
            node._prevnode = null;
        }
        _items.Clear();
    }

    // Removes the node at the given index, joins its former neighbours to each
    // other and detaches the removed node.
    internal void Remove(int index)
    {
        HtmlNode oldnode = (HtmlNode)_items[index];
        HtmlNode prev = (index > 0) ? (HtmlNode)_items[index - 1] : null;
        HtmlNode next = (index < (_items.Count - 1)) ? (HtmlNode)_items[index + 1] : null;

        // Sanity check first, so a failure leaves the list untouched.
        if (prev != null && next == prev)
        {
            throw new InvalidProgramException("Unexpected error.");
        }

        _items.RemoveAt(index);

        if (prev != null)
        {
            prev._nextnode = next;
        }
        if (next != null)
        {
            next._prevnode = prev;
        }

        oldnode._prevnode = null;
        oldnode._nextnode = null;
        oldnode._parentnode = null;
    }

    // Puts node at the given index in place of the node currently there,
    // relinks the neighbours and detaches the replaced node.
    internal void Replace(int index, HtmlNode node)
    {
        HtmlNode oldnode = (HtmlNode)_items[index];

        // Replacing a node with itself is a no-op; without this guard the
        // "detach old node" step below would wipe the links of the node that
        // stays in the list.
        if (oldnode == node)
        {
            return;
        }

        HtmlNode prev = (index > 0) ? (HtmlNode)_items[index - 1] : null;
        HtmlNode next = (index < (_items.Count - 1)) ? (HtmlNode)_items[index + 1] : null;

        // Validate before mutating so a failure leaves the list untouched.
        if ((prev != null && node == prev) || (next != null && next == node))
        {
            throw new InvalidProgramException("Unexpected error.");
        }

        _items[index] = node;

        if (prev != null)
        {
            prev._nextnode = node;
        }
        if (next != null)
        {
            next._prevnode = node;
        }

        node._prevnode = prev;
        node._nextnode = next;
        node._parentnode = _parentnode;

        oldnode._prevnode = null;
        oldnode._nextnode = null;
        oldnode._parentnode = null;
    }

    // Inserts node at the given index and links it between its new neighbours.
    internal void Insert(int index, HtmlNode node)
    {
        HtmlNode prev = (index > 0) ? (HtmlNode)_items[index - 1] : null;
        HtmlNode next = (index < _items.Count) ? (HtmlNode)_items[index] : null;

        // Validate before mutating so a failure leaves the list untouched.
        if ((prev != null && node == prev) || (next != null && next == node))
        {
            throw new InvalidProgramException("Unexpected error.");
        }

        _items.Insert(index, node);

        if (prev != null)
        {
            prev._nextnode = node;
        }
        if (next != null)
        {
            next._prevnode = node;
        }

        node._prevnode = prev;
        node._nextnode = next;
        node._parentnode = _parentnode;
    }

    // Adds node at the end of the list and links it after the current last node.
    internal void Append(HtmlNode node)
    {
        HtmlNode last = null;
        if (_items.Count > 0)
        {
            last = (HtmlNode)_items[_items.Count - 1];
        }

        // Validate before mutating so a failure leaves the list untouched.
        if (last != null && last == node)
        {
            throw new InvalidProgramException("Unexpected error.");
        }

        _items.Add(node);
        node._prevnode = last;
        node._nextnode = null;
        node._parentnode = _parentnode;

        if (last != null)
        {
            last._nextnode = node;
        }
    }

    // Adds node at the start of the list and links it before the current first node.
    internal void Prepend(HtmlNode node)
    {
        HtmlNode first = null;
        if (_items.Count > 0)
        {
            first = (HtmlNode)_items[0];
        }

        // Validate before mutating so a failure leaves the list untouched.
        if (node == first)
        {
            throw new InvalidProgramException("Unexpected error.");
        }

        _items.Insert(0, node);
        node._nextnode = first;
        node._prevnode = null;
        node._parentnode = _parentnode;

        if (first != null)
        {
            first._prevnode = node;
        }
    }

    // Stores node without touching its parent or sibling links.
    internal void Add(HtmlNode node)
    {
        _items.Add(node);
    }

    /// <summary>
    /// Gets the node at the specified index.
    /// </summary>
    public HtmlNode this[int index]
    {
        get
        {
            return (HtmlNode)_items[index];
        }
    }

    // Linear search by reference; returns -1 when the node is not in the list.
    internal int GetNodeIndex(HtmlNode node)
    {
        // TODO: should we rewrite this? what would be the key of a node?
        for (int i = 0; i < _items.Count; i++)
        {
            if (node == ((HtmlNode)_items[i]))
            {
                return i;
            }
        }
        return -1;
    }

    /// <summary>
    /// Gets the index of a given node in the list.
    /// </summary>
    /// <exception cref="ArgumentNullException">The node is null and is not in the list.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The node is not in the list.</exception>
    public int this[HtmlNode node]
    {
        get
        {
            int index = GetNodeIndex(node);
            if (index == -1)
            {
                // The message below dereferences node, so reject null explicitly.
                if (node == null)
                {
                    throw new ArgumentNullException("node");
                }
                throw new ArgumentOutOfRangeException("node", "Node \"" + node.CloneNode(false).OuterHtml + "\" was not found in the collection");
            }
            return index;
        }
    }

    /// <summary>
    /// Returns an enumerator that can iterate through the list.
    /// </summary>
    /// <returns>An IEnumerator for the entire list.</returns>
    public HtmlNodeEnumerator GetEnumerator()
    {
        return new HtmlNodeEnumerator(_items);
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <summary>
    /// Represents an enumerator that can iterate through the list.
    /// </summary>
    public class HtmlNodeEnumerator: IEnumerator
    {
        int _index;
        ArrayList _items;

        internal HtmlNodeEnumerator(ArrayList items)
        {
            _items = items;
            _index = -1;
        }

        /// <summary>
        /// Sets the enumerator to its initial position, which is before the first element in the collection.
        /// </summary>
        public void Reset()
        {
            _index = -1;
        }

        /// <summary>
        /// Advances the enumerator to the next element of the collection.
        /// </summary>
        /// <returns>true if the enumerator was successfully advanced to the next element, false if the enumerator has passed the end of the collection.</returns>
        public bool MoveNext()
        {
            _index++;
            return (_index < _items.Count);
        }

        /// <summary>
        /// Gets the current element in the collection.
        /// </summary>
        public HtmlNode Current
        {
            get
            {
                return (HtmlNode)(_items[_index]);
            }
        }

        /// <summary>
        /// Gets the current element in the collection.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return (Current);
            }
        }
    }
}

/// <summary>
/// Represents an HTML text node.
/// </summary>
public class HtmlTextNode: HtmlNode
{
    // Explicitly assigned text; null until InnerHtml or Text is set.
    private string _text;

    internal HtmlTextNode(HtmlDocument ownerdocument, int index):
        base(HtmlNodeType.Text, ownerdocument, index)
    {
    }

    /// <summary>
    /// Gets or Sets the HTML between the start and end tags of the object. In the case of a text node, it is equal to OuterHtml.
    /// </summary>
    public override string InnerHtml
    {
        get
        {
            return OuterHtml;
        }
        set
        {
            _text = value;
        }
    }

    /// <summary>
    /// Gets the object and its content in HTML: the assigned text if any, otherwise the base implementation's value.
    /// </summary>
    public override string OuterHtml
    {
        get
        {
            if (_text == null)
            {
                return base.OuterHtml;
            }
            return _text;
        }
    }

    /// <summary>
    /// Gets or Sets the text of the node.
    /// </summary>
    public string Text
    {
        get
        {
            if (_text == null)
            {
                return base.OuterHtml;
            }
            return _text;
        }
        set
        {
            _text = value;
        }
    }
}

/// <summary>
/// Represents an HTML comment.
/// </summary>
public class HtmlCommentNode: HtmlNode
{
    // Explicitly assigned comment text; null until InnerHtml or Comment is set.
    private string _comment;

    internal HtmlCommentNode(HtmlDocument ownerdocument, int index):
        base(HtmlNodeType.Comment, ownerdocument, index)
    {
    }

    /// <summary>
    /// Gets or Sets the inner HTML of the comment node: the assigned comment text if any, otherwise the base implementation's value.
    /// </summary>
    public override string InnerHtml
    {
        get
        {
            if (_comment == null)
            {
                return base.InnerHtml;
            }
            return _comment;
        }
        set
        {
            _comment = value;
        }
    }

    /// <summary>
    /// Gets the object and its content in HTML.
    /// </summary>
    public override string OuterHtml
    {
        get
        {
            if (_comment == null)
            {
                return base.OuterHtml;
            }
            // Wrap the assigned text in comment delimiters so that it is not lost.
            // NOTE(review): delimiters follow upstream HtmlAgilityPack - confirm.
            return "<!--" + _comment + "-->";
        }
    }

    /// <summary>
    /// Gets or Sets the comment text of the node.
    /// </summary>
    public string Comment
    {
        get
        {
            if (_comment == null)
            {
                return base.InnerHtml;
            }
            return _comment;
        }
        set
        {
            _comment = value;
        }
    }
}
} // end of namespace HtmlAgilityPack