// HtmlAgilityPack V1.0 - Simon Mourier <[email protected]>
- using System;
- using System.IO;
- using System.Text;
- using System.Diagnostics;
- using System.Collections;
- using System.Text.RegularExpressions;
- using System.Xml;
- using System.Xml.XPath;
- namespace HtmlAgilityPack
- {
/// <summary>
/// Identifies the kind of problem described by an <see cref="HtmlParseError"/>.
/// </summary>
public enum HtmlParseErrorCode
{
    /// <summary>
    /// A tag was not closed.
    /// </summary>
    TagNotClosed = 0,

    /// <summary>
    /// A tag was not opened.
    /// </summary>
    TagNotOpened = 1,

    /// <summary>
    /// The encoding of the stream and the encoding declared in the document (META) do not match.
    /// </summary>
    CharsetMismatch = 2,

    /// <summary>
    /// An end tag was not required.
    /// </summary>
    EndTagNotRequired = 3,

    /// <summary>
    /// An end tag is invalid at this position.
    /// </summary>
    EndTagInvalidHere = 4
}
/// <summary>
/// Describes a single parsing error found while a document was being parsed.
/// Instances are immutable: every value is supplied to the constructor and only exposed through getters.
/// </summary>
public class HtmlParseError
{
    private readonly HtmlParseErrorCode _code;
    private readonly int _line;
    private readonly int _linePosition;
    private readonly int _streamPosition;
    private readonly string _sourceText;
    private readonly string _reason;

    // Only the library itself creates errors, hence the internal constructor.
    internal HtmlParseError(
        HtmlParseErrorCode code,
        int line,
        int linePosition,
        int streamPosition,
        string sourceText,
        string reason)
    {
        this._code = code;
        this._line = line;
        this._linePosition = linePosition;
        this._streamPosition = streamPosition;
        this._sourceText = sourceText;
        this._reason = reason;
    }

    /// <summary>
    /// Gets the type of error.
    /// </summary>
    public HtmlParseErrorCode Code
    {
        get { return this._code; }
    }

    /// <summary>
    /// Gets the line number of this error in the document.
    /// </summary>
    public int Line
    {
        get { return this._line; }
    }

    /// <summary>
    /// Gets the column number of this error in the document.
    /// </summary>
    public int LinePosition
    {
        get { return this._linePosition; }
    }

    /// <summary>
    /// Gets the absolute stream position of this error, relative to the start of the document.
    /// </summary>
    public int StreamPosition
    {
        get { return this._streamPosition; }
    }

    /// <summary>
    /// Gets the full text of the line containing the error.
    /// </summary>
    public string SourceText
    {
        get { return this._sourceText; }
    }

    /// <summary>
    /// Gets a description for the error.
    /// </summary>
    public string Reason
    {
        get { return this._reason; }
    }
}
-
- /// <summary>
- /// Represents a complete HTML document.
- /// </summary>
- public class HtmlDocument: IXPathNavigable
- {
// Messages used for exceptions thrown by the library.
internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";

// Node bookkeeping. _openednodes and _nodesid are re-created (or reset to null)
// by every Load / DetectEncoding call, depending on the matching option.
internal Hashtable _openednodes;    // allocated only when OptionCheckSyntax is true
internal Hashtable _lastnodes = new Hashtable();
internal Hashtable _nodesid;        // lower-cased id -> HtmlNode, allocated only when OptionUseIdAttribute is true
private HtmlNode _documentnode;

// Raw document text and the values exposed by Remainder / RemainderOffset.
internal string _text;
private string _remainder;
private int _remainderOffset;

// Parser working state (NOTE(review): used by Parse(), which is outside this excerpt).
private HtmlNode _currentnode;
private HtmlNode _lastparentnode;
private HtmlAttribute _currentattribute;
private int _index;
private int _line;
private int _lineposition;
private int _maxlineposition;
private int _c;
private bool _fullcomment;

// Encodings exposed by StreamEncoding / DeclaredEncoding.
private Encoding _streamencoding;
private Encoding _declaredencoding;

// Errors exposed by ParseErrors.
private ArrayList _parseerrors = new ArrayList();
private ParseState _state, _oldstate;
private Crc32 _crc32 = null;

// True while DetectEncoding is running the parser, false for a regular Load.
private bool _onlyDetectEncoding = false;
// ---- public options ----

/// <summary>
/// Defines if a checksum must be computed for the document while parsing. Default is false.
/// </summary>
public bool OptionComputeChecksum = false;

/// <summary>
/// Defines if the declared encoding must be read from the document.
/// The declared encoding is taken from the
/// meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// Default is true.
/// </summary>
public bool OptionReadEncoding = true;

/// <summary>
/// Defines if non closed nodes will be checked at the end of parsing. Default is true.
/// </summary>
public bool OptionCheckSyntax = true;

/// <summary>
/// Defines if the 'id' attribute must be specifically used. Default is true.
/// </summary>
public bool OptionUseIdAttribute = true;

/// <summary>
/// Defines if empty nodes must be written as closed during output. Default is false.
/// </summary>
public bool OptionWriteEmptyNodes = false;

/// <summary>
/// Defines if output must conform to XML, instead of HTML. Default is false.
/// </summary>
public bool OptionOutputAsXml = false;

/// <summary>
/// Defines if names must be output in uppercase. Default is false.
/// </summary>
public bool OptionOutputUpperCase = false;

/// <summary>
/// Defines if attribute value output must be optimized (not enclosed in double quotes when possible).
/// Default is false.
/// </summary>
public bool OptionOutputOptimizeAttributeValues = false;

/// <summary>
/// Adds debugging attributes to nodes. Default is false.
/// </summary>
public bool OptionAddDebuggingAttributes = false;

/// <summary>
/// Defines if source text must be extracted when parse errors are reported.
/// With many (or cascading) errors this can dramatically slow parsing down.
/// Default is false.
/// </summary>
public bool OptionExtractErrorSourceText = false;

/// <summary>
/// Defines if closing for non closed nodes must be done at the end or directly in the document.
/// Setting this to true can actually change how browsers render the page. Default is false.
/// </summary>
public bool OptionAutoCloseOnEnd = false;

/// <summary>
/// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected.
/// Default is false.
/// </summary>
public bool OptionFixNestedTags = false;

/// <summary>
/// Defines the maximum length of the source text extracted for a parse error. Default is 100.
/// </summary>
public int OptionExtractErrorSourceTextMaxLength = 100;

/// <summary>
/// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
/// </summary>
public Encoding OptionDefaultStreamEncoding = System.Text.Encoding.Default;

/// <summary>
/// Defines the name of a node that will throw the StopperNodeException when found as an end node.
/// Default is null.
/// </summary>
public string OptionStopperNodeName = null;
/// <summary>
/// Gets the remaining text.
/// Will always be null if OptionStopperNodeName is null.
/// </summary>
public string Remainder
{
    get { return this._remainder; }
}
/// <summary>
/// Gets the offset of <see cref="Remainder"/> in the original Html text.
/// If OptionStopperNodeName is null, this will return the length of the original Html text.
/// </summary>
public int RemainderOffset
{
    get { return this._remainderOffset; }
}
/// <summary>
/// Gets the list of parse errors found in the document.
/// The returned list is the document's own instance, not a copy.
/// </summary>
public ArrayList ParseErrors
{
    get { return this._parseerrors; }
}
/// <summary>
/// Gets the document's stream encoding.
/// This is null when the document was not read from a StreamReader.
/// </summary>
public System.Text.Encoding StreamEncoding
{
    get { return this._streamencoding; }
}
/// <summary>
/// Gets the document's declared encoding.
/// The declared encoding is taken from the
/// meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public System.Text.Encoding DeclaredEncoding
{
    get { return this._declaredencoding; }
}
/// <summary>
/// Creates an empty HTML document that only contains its document node.
/// </summary>
public HtmlDocument()
{
    this._documentnode = this.CreateNode(HtmlNodeType.Document, 0);
}
// Returns the first direct child of the document node named "?xml", or null when there is none.
internal HtmlNode GetXmlDeclaration()
{
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name == "?xml")
            {
                return child;
            }
        }
    }
    return null;
}
/// <summary>
/// Applies HTML encoding to a specified string.
/// The characters &amp;, &lt;, &gt; and " are replaced by their entities. An ampersand that already
/// starts one of the entities amp, lt, gt or quot (case-insensitive) is left untouched, so that
/// text which is already encoded is not encoded twice.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }
    // replace & by &amp; but only once: the negative lookahead skips ampersands
    // that already introduce one of the four entities produced below
    Regex rx = new Regex("&(?!(amp;)|(lt;)|(gt;)|(quot;))", RegexOptions.IgnoreCase);
    return rx.Replace(html, "&amp;")
        .Replace("<", "&lt;")
        .Replace(">", "&gt;")
        .Replace("\"", "&quot;");
}
/// <summary>
/// Detects the encoding of an HTML stream.
/// The stream is wrapped in a StreamReader that is not closed here, so the stream stays open.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }
    StreamReader streamReader = new StreamReader(stream);
    return DetectEncoding(streamReader);
}
/// <summary>
/// Detects the encoding of an HTML file.
/// The file is opened with <see cref="OptionDefaultStreamEncoding"/> and is always closed again,
/// even when detection throws.
/// </summary>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // using guarantees the file handle is released if detection fails
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding.</returns>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    StringReader textReader = new StringReader(html);
    Encoding detected = DetectEncoding(textReader);
    textReader.Close();

    return detected;
}
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// The whole text is read and parsed in "detect only" mode. This replaces the document's
/// current text, document node and encoding information.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>
/// The encoding carried by the EncodingFoundException raised during parsing,
/// or null when parsing completes without one.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }
    _onlyDetectEncoding = true;
    if (OptionCheckSyntax)
    {
        _openednodes = new Hashtable();
    }
    else
    {
        _openednodes = null;
    }
    if (OptionUseIdAttribute)
    {
        _nodesid = new Hashtable();
    }
    else
    {
        _nodesid = null;
    }
    _declaredencoding = null;
    _text = reader.ReadToEnd();

    // StreamReader only performs its encoding auto-detection on the first read,
    // so CurrentEncoding is sampled after the text has been consumed.
    StreamReader sr = reader as StreamReader;
    if (sr != null)
    {
        _streamencoding = sr.CurrentEncoding;
    }
    else
    {
        _streamencoding = null;
    }

    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    // this is almost a hack, but it allows us not to muck with the original parsing code:
    // the encoding is handed back through an exception
    try
    {
        Parse();
    }
    catch (EncodingFoundException ex)
    {
        return ex.Encoding;
    }
    return null;
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with <see cref="OptionDefaultStreamEncoding"/>.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    StreamReader streamReader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream, decoding it with the given encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    StreamReader streamReader = new StreamReader(stream, encoding);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a stream.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    StreamReader streamReader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(streamReader);
}
/// <summary>
/// Loads an HTML document from a file, decoding it with <see cref="OptionDefaultStreamEncoding"/>.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // using guarantees the file handle is released if parsing fails
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    // using guarantees the file handle is released if parsing fails
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file, decoding it with the given encoding.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using guarantees the file handle is released if parsing fails
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using guarantees the file handle is released if parsing fails
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads an HTML document from a file.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using guarantees the file handle is released if parsing fails
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
/// <summary>
/// Loads the HTML document from the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    StringReader textReader = new StringReader(html);
    Load(textReader);
    textReader.Close();
}
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// Equivalent to calling the two-argument overload with detectEncoding set to true.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    bool detectEncoding = true;
    DetectEncodingAndLoad(path, detectEncoding);
}
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// When detection is disabled, or no encoding is detected, the file is loaded with Load(path).
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;
    if (detected != null)
    {
        Load(path, detected);
    }
    else
    {
        Load(path);
    }
}
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// The reader is read to its end and parsed. When OptionCheckSyntax is set, a TagNotClosed
/// error is recorded for every node still registered as opened once parsing is done.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
public void Load(TextReader reader)
{
    // all Load methods pass down to this one
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
    }
    else
    {
        try
        {
            // trigger bom read if needed
            streamReader.Peek();
        }
        catch
        {
            // void on purpose
        }
        _streamencoding = streamReader.CurrentEncoding;
    }

    _declaredencoding = null;
    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (!OptionCheckSyntax)
    {
        return;
    }

    foreach (HtmlNode opened in _openednodes.Values)
    {
        if (opened._starttag)
        {
            // extract (and truncate) the offending markup only when asked to
            string sourceText = string.Empty;
            if (OptionExtractErrorSourceText)
            {
                sourceText = opened.OuterHtml;
                if (sourceText.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    sourceText = sourceText.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }

            AddError(
                HtmlParseErrorCode.TagNotClosed,
                opened._line, opened._lineposition,
                opened._streamposition, sourceText,
                "End tag </" + opened.Name + "> was not found");
        }
        // nodes whose _starttag is false were already reported
    }

    // we don't need this anymore
    _openednodes.Clear();
}
// Picks the encoding used for output: the declared encoding when there is one,
// otherwise the stream encoding, otherwise OptionDefaultStreamEncoding.
internal System.Text.Encoding GetOutEncoding()
{
    if (_declaredencoding != null)
    {
        return _declaredencoding;
    }
    if (_streamencoding != null)
    {
        return _streamencoding;
    }
    return OptionDefaultStreamEncoding;
}
/// <summary>
/// Gets the document's output encoding, as computed by GetOutEncoding().
/// </summary>
public System.Text.Encoding Encoding
{
    get { return this.GetOutEncoding(); }
}
/// <summary>
/// Saves the HTML document to the specified stream, using the document's output encoding.
/// The stream is flushed but left open.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> is null.</exception>
public void Save(Stream outStream)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);
    // StreamWriter buffers its output: without a flush the tail of the document
    // may never reach outStream. The writer is not closed, as that would close
    // the caller's stream too.
    sw.Flush();
}
/// <summary>
/// Saves the HTML document to the specified stream, using the given encoding.
/// The stream is flushed but left open.
/// </summary>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> or <paramref name="encoding"/> is null.</exception>
public void Save(Stream outStream, System.Text.Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);
    // StreamWriter buffers its output: without a flush the tail of the document
    // may never reach outStream. The writer is not closed, as that would close
    // the caller's stream too.
    sw.Flush();
}
/// <summary>
/// Saves the mixed document to the specified file, using the document's output encoding.
/// An existing file is overwritten. The file is always closed again, even when saving throws.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
public void Save(string filename)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    // using flushes and closes the file even if writing fails
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the mixed document to the specified file, using the given encoding.
/// An existing file is overwritten. The file is always closed again, even when saving throws.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="encoding"/> is null.</exception>
public void Save(string filename, System.Text.Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // using flushes and closes the file even if writing fails
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// Simply forwards to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    // the TextWriter-typed local selects the TextWriter overload
    TextWriter textWriter = writer;
    Save(textWriter);
}
/// <summary>
/// Saves the HTML document to the specified TextWriter.
/// The writer is flushed afterwards but not closed.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(TextWriter writer)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    // Same as the XmlWriter overload: push buffered output through, so writers created
    // internally by the Stream overloads do not lose the end of the document.
    writer.Flush();
}
/// <summary>
/// Saves the HTML document to the specified XmlWriter.
/// The writer is flushed afterwards but not closed.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(XmlWriter writer)
{
    // same argument check as the TextWriter overload
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    writer.Flush();
}
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>An XPathNavigator object. The XPathNavigator is positioned on the root of the document.</returns>
public XPathNavigator CreateNavigator()
{
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
/// <summary>
/// Gets a valid XML name.
/// Lower-case ASCII letters, digits, '_', '-' and '.' are copied as they are. Any other character
/// is replaced by the lower-case hexadecimal form of its UTF-8 bytes followed by '_'. When at
/// least one character had to be replaced, the result is also prefixed with '_'.
/// </summary>
/// <param name="name">Any text. May not be null.</param>
/// <returns>A string that is a valid XML name.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public static string GetXmlName(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // NOTE(review): an empty input, or one starting with a digit, '-' or '.', is returned
    // unchanged although such a string is not a valid XML name - confirm whether callers rely on it.
    StringBuilder xmlname = new StringBuilder(name.Length);
    bool nameisok = true;
    char[] single = new char[1];
    for (int i = 0; i < name.Length; i++)
    {
        char c = name[i];
        // names are lcase
        // note: we are very limited here, too much?
        // (':' is deliberately not accepted)
        if (((c >= 'a') && (c <= 'z')) ||
            ((c >= '0') && (c <= '9')) ||
            (c == '_') || (c == '-') || (c == '.'))
        {
            xmlname.Append(c);
        }
        else
        {
            nameisok = false;
            // characters are encoded one at a time, exactly as before
            single[0] = c;
            byte[] bytes = System.Text.Encoding.UTF8.GetBytes(single);
            for (int j = 0; j < bytes.Length; j++)
            {
                xmlname.Append(bytes[j].ToString("x2"));
            }
            xmlname.Append('_');
        }
    }

    if (nameisok)
    {
        return xmlname.ToString();
    }
    return "_" + xmlname.ToString();
}
// Registers node under the lower-cased id, or removes the registration when node is null.
// Does nothing when id tracking is disabled, the id table does not exist, or id is null.
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute || (_nodesid == null) || (id == null))
    {
        return;
    }

    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
/// <summary>
/// Gets the HTML node with the specified 'id' attribute value.
/// The lookup uses the lower-cased id.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }
    if (_nodesid == null)
    {
        // no id table: the document was loaded with OptionUseIdAttribute turned off
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    object found = _nodesid[key];
    return found as HtmlNode;
}
/// <summary>
/// Creates an HTML element node with the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element._name = name;
    return element;
}
/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode node = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)node;
}
/// <summary>
/// Builds a new HTML comment node and assigns the specified comment text to it.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode commentNode = CreateComment();
    commentNode.Comment = comment;
    return commentNode;
}
/// <summary>
/// Builds a new, empty HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
}
/// <summary>
/// Builds a new HTML text node and assigns the specified text to it.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
}
/// <summary>
/// Creates a node of the given type, passing -1 as its index.
/// </summary>
/// <param name="type">The type of node to create.</param>
/// <returns>The new node.</returns>
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}
/// <summary>
/// Creates a node of the given type: comment and text types get their dedicated
/// node classes, every other type gets a plain HtmlNode.
/// </summary>
/// <param name="type">The type of node to create.</param>
/// <param name="index">The index handed to the node constructor.</param>
/// <returns>The new node, owned by this document.</returns>
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }
    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }
    return new HtmlNode(type, this, index);
}
/// <summary>
/// Creates a blank attribute owned by this document.
/// </summary>
/// <returns>The new HTML attribute.</returns>
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute attribute = new HtmlAttribute(this);
    return attribute;
}
/// <summary>
/// Builds a new HTML attribute and gives it the specified name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}
/// <summary>
/// Builds a new HTML attribute with the specified name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}
/// <summary>
/// Gets the node that represents the whole document (the root of the node tree).
/// </summary>
public HtmlNode DocumentNode
{
    get { return _documentnode; }
}
/// <summary>
/// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
/// </summary>
public int CheckSum
{
    get
    {
        // no checksum object exists unless one was created for the parse
        return (_crc32 == null) ? 0 : (int)_crc32.CheckSum;
    }
}
/// <summary>
/// Builds a parse error from the given details, appends it to the parse error list and returns it.
/// </summary>
/// <param name="code">The error code.</param>
/// <param name="line">The line where the error was detected.</param>
/// <param name="linePosition">The position within that line.</param>
/// <param name="streamPosition">The position within the parsed text.</param>
/// <param name="sourceText">The source text related to the error.</param>
/// <param name="reason">A description of the error.</param>
/// <returns>The error that was added.</returns>
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError error = new HtmlParseError(
        code, line, linePosition, streamPosition, sourceText, reason);
    _parseerrors.Add(error);
    return error;
}
/// <summary>
/// States of the character-driven parser loop in Parse().
/// </summary>
private enum ParseState
{
    /// <summary>Outside of any tag.</summary>
    Text = 0,
    /// <summary>Just after a '&lt;': deciding between an opening and a closing tag.</summary>
    WhichTag = 1,
    /// <summary>Reading the tag name.</summary>
    Tag = 2,
    /// <summary>Inside a tag, between two attributes.</summary>
    BetweenAttributes = 3,
    /// <summary>After a '/' (or '?') inside a tag, waiting for '&gt;'.</summary>
    EmptyTag = 4,
    /// <summary>Reading an attribute name.</summary>
    AttributeName = 5,
    /// <summary>After an attribute name, before a possible '='.</summary>
    AttributeBeforeEquals = 6,
    /// <summary>After the '=' of an attribute, before its value.</summary>
    AttributeAfterEquals = 7,
    /// <summary>Reading an unquoted attribute value.</summary>
    AttributeValue = 8,
    /// <summary>Inside a node started by '&lt;!'.</summary>
    Comment = 9,
    /// <summary>Reading an attribute value enclosed in quotes.</summary>
    QuotedAttributeValue = 10,
    /// <summary>Inside a '&lt;%' ... '%&gt;' block.</summary>
    ServerSideCode = 11,
    /// <summary>Inside the raw content of an element, scanning for its closing tag.</summary>
    PcData = 12
}
/// <summary>
/// Moves the parser one character forward: feeds the current character to the checksum
/// (when one is being computed), advances the index and updates the line counters.
/// </summary>
private void IncrementPosition()
{
    if (_crc32 != null)
    {
        // REVIEW: should we add some checksum code in DecrementPosition too?
        _crc32.AddToCRC32(_c);
    }

    _index++;

    // remembered so that DecrementPosition can step back over a line break
    _maxlineposition = _lineposition;

    bool lineFeed = (_c == 10);
    if (!lineFeed)
    {
        _lineposition++;
        return;
    }
    _lineposition = 1;
    _line++;
}
/// <summary>
/// Moves the parser one character back and undoes the matching line counter update.
/// </summary>
private void DecrementPosition()
{
    _index--;

    if (_lineposition != 1)
    {
        _lineposition--;
        return;
    }

    // at the start of a line: go back to the position saved by IncrementPosition
    _lineposition = _maxlineposition;
    _line--;
}
/// <summary>
/// Parses _text into nodes below _documentnode. The text is consumed one character at a
/// time by a state machine (see ParseState); the Push* helpers record where nodes, names
/// and attributes start and end. Parse errors, line counters and the last-nodes table are
/// reset first, and a checksum is computed when OptionComputeChecksum is set.
/// </summary>
private void Parse()
{
    int lastquote = 0;
    if (OptionComputeChecksum)
    {
        _crc32 = new Crc32();
    }

    // reset the per-parse state
    _lastnodes = new Hashtable();
    _c = 0;
    _fullcomment = false;
    _parseerrors = new ArrayList();
    _line = 1;
    _lineposition = 1;
    _maxlineposition = 1;

    _state = ParseState.Text;
    _oldstate = _state;
    _documentnode._innerlength = _text.Length;
    _documentnode._outerlength = _text.Length;
    _remainderOffset = _text.Length;

    _lastparentnode = _documentnode;
    _currentnode = CreateNode(HtmlNodeType.Text, 0);
    _currentattribute = null;

    _index = 0;
    PushNodeStart(HtmlNodeType.Text, 0);
    while (_index < _text.Length)
    {
        _c = _text[_index];
        // from here on, _index points one past the current character _c
        IncrementPosition();

        switch (_state)
        {
            case ParseState.Text:
                if (NewCheck())
                    continue;
                break;

            case ParseState.WhichTag:
                if (NewCheck())
                    continue;
                if (_c == '/')
                {
                    PushNodeNameStart(false, _index);
                }
                else
                {
                    PushNodeNameStart(true, _index - 1);
                    // re-read this character as the first character of the name
                    DecrementPosition();
                }
                _state = ParseState.Tag;
                break;

            case ParseState.Tag:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                {
                    PushNodeNameEnd(_index - 1);
                    if (_state != ParseState.Tag)
                        continue;
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '/')
                {
                    PushNodeNameEnd(_index - 1);
                    if (_state != ParseState.Tag)
                        continue;
                    _state = ParseState.EmptyTag;
                    continue;
                }
                if (_c == '>')
                {
                    PushNodeNameEnd(_index - 1);
                    if (_state != ParseState.Tag)
                        continue;
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.Tag)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                }
                break;

            case ParseState.BetweenAttributes:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                    continue;
                if ((_c == '/') || (_c == '?'))
                {
                    _state = ParseState.EmptyTag;
                    continue;
                }
                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }

                    if (_state != ParseState.BetweenAttributes)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                PushAttributeNameStart(_index - 1);
                _state = ParseState.AttributeName;
                break;

            case ParseState.EmptyTag:
                if (NewCheck())
                    continue;
                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, true))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.EmptyTag)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                _state = ParseState.BetweenAttributes;
                break;

            case ParseState.AttributeName:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                {
                    PushAttributeNameEnd(_index - 1);
                    _state = ParseState.AttributeBeforeEquals;
                    continue;
                }
                if (_c == '=')
                {
                    PushAttributeNameEnd(_index - 1);
                    _state = ParseState.AttributeAfterEquals;
                    continue;
                }
                if (_c == '>')
                {
                    PushAttributeNameEnd(_index - 1);
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeName)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.AttributeBeforeEquals:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                    continue;
                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeBeforeEquals)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                if (_c == '=')
                {
                    _state = ParseState.AttributeAfterEquals;
                    continue;
                }
                // no equals, no whitespace: a new attribute is starting
                _state = ParseState.BetweenAttributes;
                DecrementPosition();
                break;

            case ParseState.AttributeAfterEquals:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                    continue;
                if ((_c == '\'') || (_c == '"'))
                {
                    _state = ParseState.QuotedAttributeValue;
                    PushAttributeValueStart(_index);
                    // remember which quote opened the value so only the same one closes it
                    lastquote = _c;
                    continue;
                }
                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeAfterEquals)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                PushAttributeValueStart(_index - 1);
                _state = ParseState.AttributeValue;
                break;

            case ParseState.AttributeValue:
                if (NewCheck())
                    continue;
                if (IsWhiteSpace(_c))
                {
                    PushAttributeValueEnd(_index - 1);
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '>')
                {
                    PushAttributeValueEnd(_index - 1);
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeValue)
                        continue;
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.QuotedAttributeValue:
                if (_c == lastquote)
                {
                    PushAttributeValueEnd(_index - 1);
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '<')
                {
                    if (_index < _text.Length)
                    {
                        if (_text[_index] == '%')
                        {
                            _oldstate = _state;
                            _state = ParseState.ServerSideCode;
                            continue;
                        }
                    }
                }
                break;

            case ParseState.Comment:
                if (_c == '>')
                {
                    if (_fullcomment)
                    {
                        // a "<!--" comment only ends on "-->"
                        if ((_text[_index - 2] != '-') ||
                            (_text[_index - 3] != '-'))
                        {
                            continue;
                        }
                    }
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.ServerSideCode:
                if (_c == '%')
                {
                    if (_index < _text.Length)
                    {
                        if (_text[_index] == '>')
                        {
                            switch (_oldstate)
                            {
                                case ParseState.AttributeAfterEquals:
                                    _state = ParseState.AttributeValue;
                                    break;
                                case ParseState.BetweenAttributes:
                                    PushAttributeNameEnd(_index + 1);
                                    _state = ParseState.BetweenAttributes;
                                    break;
                                default:
                                    _state = _oldstate;
                                    break;
                            }
                            // skip the '>' of "%>"
                            IncrementPosition();
                        }
                    }
                }
                break;

            case ParseState.PcData:
                // A closing tag can only start on '<'. Skipping every other character here
                // avoids building a substring for each character of the raw content.
                if (_c != '<')
                    break;

                // look for </tag + 1 char
                // check buffer end
                if ((_currentnode._namelength + 3) <= (_text.Length - (_index - 1)))
                {
                    if (string.Compare(_text.Substring(_index - 1, _currentnode._namelength + 2),
                        "</" + _currentnode.Name, true) == 0)
                    {
                        int c = _text[_index - 1 + 2 + _currentnode.Name.Length];
                        if ((c == '>') || (IsWhiteSpace(c)))
                        {
                            // add the script as a text node
                            HtmlNode script = CreateNode(HtmlNodeType.Text,
                                _currentnode._outerstartindex + _currentnode._outerlength);
                            script._outerlength = _index - 1 - script._outerstartindex;
                            _currentnode.AppendChild(script);

                            PushNodeStart(HtmlNodeType.Element, _index - 1);
                            PushNodeNameStart(false, _index - 1 + 2);
                            _state = ParseState.Tag;
                            IncrementPosition();
                        }
                    }
                }
                break;
        }
    }

    // finish the current work
    if (_currentnode._namestartindex > 0)
    {
        PushNodeNameEnd(_index);
    }
    PushNodeEnd(_index, false);

    // we don't need this anymore
    _lastnodes.Clear();
}
/// <summary>
/// Handles a '&lt;' met by the parser: starts a server-side block ("&lt;%"), a comment
/// ("&lt;!") or an element, after ending the node that was in progress.
/// </summary>
/// <returns>
/// false when the current character is not '&lt;' (nothing was done); true when the
/// character was handled here, including the case where parsing must stop.
/// </returns>
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }
    if (_index < _text.Length)
    {
        if (_text[_index] == '%')
        {
            // "<%": remember where we were so the state can be restored after "%>"
            switch (_state)
            {
                case ParseState.AttributeAfterEquals:
                    PushAttributeValueStart(_index - 1);
                    break;
                case ParseState.BetweenAttributes:
                    PushAttributeNameStart(_index - 1);
                    break;
                case ParseState.WhichTag:
                    PushNodeNameStart(true, _index - 1);
                    _state = ParseState.Tag;
                    break;
            }
            _oldstate = _state;
            _state = ParseState.ServerSideCode;
            return true;
        }
    }

    if (!PushNodeEnd(_index - 1, true))
    {
        // stop parsing
        _index = _text.Length;
        return true;
    }
    _state = ParseState.WhichTag;
    if ((_index - 1) <= (_text.Length - 2))
    {
        if (_text[_index] == '!')
        {
            PushNodeStart(HtmlNodeType.Comment, _index - 1);
            PushNodeNameStart(true, _index);
            PushNodeNameEnd(_index + 1);
            _state = ParseState.Comment;

            // Always assign the flag: when fewer than two characters follow the '!',
            // this cannot be a "<!--" comment, and the value left over from an earlier
            // comment must not be reused.
            _fullcomment = (_index < (_text.Length - 2)) &&
                (_text[_index + 1] == '-') &&
                (_text[_index + 2] == '-');
            return true;
        }
    }
    PushNodeStart(HtmlNodeType.Element, _index - 1);
    return true;
}
/// <summary>
/// Reads the encoding declared by a
/// &lt;meta http-equiv="content-type" content="text/html;charset=..." /&gt; node.
/// Stores it in _declaredencoding, throws EncodingFoundException when only encoding
/// detection was requested, and records a CharsetMismatch parse error when it differs
/// from the stream encoding. Does nothing when OptionReadEncoding is false, when the
/// node is not such a meta node, or when the declared charset is not a known encoding.
/// </summary>
/// <param name="node">The node that has just ended; its attributes are already populated.</param>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
        return;

    // quick length check first, avoids allocating the name string for most nodes
    if (node._namelength != 4)
        return;
    if (node.Name != "meta") // all node names are lowercase
        return;

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
        return;
    if (string.Compare(att.Value, "content-type", true) != 0)
        return;

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
        return;

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
        return;

    // The charset comes straight from the document. An unknown or empty name makes
    // GetEncoding throw, which must not abort the parsing of the whole document:
    // such a declaration is ignored.
    Encoding declared;
    try
    {
        declared = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        return;
    }
    catch (NotSupportedException)
    {
        return;
    }

    _declaredencoding = declared;
    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_streamencoding == null)
        return;

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
    }
}
/// <summary>
/// Starts a new current attribute whose name begins at the given index, stamping it
/// with the current line, line position and stream position.
/// </summary>
/// <param name="index">The index of the first character of the attribute name.</param>
private void PushAttributeNameStart(int index)
{
    HtmlAttribute started = CreateAttribute();
    _currentattribute = started;

    started._namestartindex = index;
    started._streamposition = index;
    started._line = _line;
    started._lineposition = _lineposition;
}
/// <summary>
/// Ends the name of the current attribute at the given index and appends the
/// attribute to the current node.
/// </summary>
/// <param name="index">The index just past the last character of the attribute name.</param>
private void PushAttributeNameEnd(int index)
{
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;
    _currentnode.Attributes.Append(_currentattribute);
}
/// <summary>
/// Records where the value of the current attribute starts.
/// </summary>
/// <param name="index">The index of the first character of the value.</param>
private void PushAttributeValueStart(int index)
{
    HtmlAttribute attribute = _currentattribute;
    attribute._valuestartindex = index;
}
/// <summary>
/// Records the length of the value of the current attribute, measured from its start index.
/// </summary>
/// <param name="index">The index just past the last character of the value.</param>
private void PushAttributeValueEnd(int index)
{
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
}
/// <summary>
/// Makes a new node of the given type the current node and stamps it with the current
/// line, line position and stream position. For elements the line position is moved
/// back by one.
/// </summary>
/// <param name="type">The type of the node that starts.</param>
/// <param name="index">The index in the text where the node starts.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    _currentnode._line = _line;
    _currentnode._lineposition =
        (type == HtmlNodeType.Element) ? (_lineposition - 1) : _lineposition;
    _currentnode._streamposition = index;
}
/// <summary>
/// Ends the current node at the given index: fixes its outer length, attaches it to the
/// last parent node when appropriate and closes it when required.
/// </summary>
/// <param name="index">The index in the text where the node ends.</param>
/// <param name="close">true to close the current node once it has been ended.</param>
/// <returns>false when the stopper node was reached and parsing must stop, true otherwise.</returns>
private bool PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    bool textOrComment =
        (_currentnode._nodetype == HtmlNodeType.Text) ||
        (_currentnode._nodetype == HtmlNodeType.Comment);

    if (textOrComment)
    {
        // forget about void nodes
        if (_currentnode._outerlength > 0)
        {
            _currentnode._innerlength = _currentnode._outerlength;
            _currentnode._innerstartindex = _currentnode._outerstartindex;
            if (_lastparentnode != null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
        }
    }
    else if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
    {
        // add to parent node
        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(_currentnode);
        }

        ReadDocumentEncoding(_currentnode);

        // remember last node of this kind
        HtmlNode previousSameName = (HtmlNode)_lastnodes[_currentnode.Name];
        _currentnode._prevwithsamename = previousSameName;
        _lastnodes[_currentnode.Name] = _currentnode;

        // change parent?
        bool canBeParent =
            (_currentnode.NodeType == HtmlNodeType.Document) ||
            (_currentnode.NodeType == HtmlNodeType.Element);
        if (canBeParent)
        {
            _lastparentnode = _currentnode;
        }

        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
            _state = ParseState.PcData;
            return true;
        }

        if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
            (HtmlNode.IsEmptyElement(_currentnode.Name)))
        {
            close = true;
        }
    }

    // an open start tag that does not have to be closed: nothing more to do
    if ((!close) && (_currentnode._starttag))
    {
        return true;
    }

    bool stopperReached =
        (OptionStopperNodeName != null) && (_remainder == null) &&
        (string.Compare(_currentnode.Name, OptionStopperNodeName, true) == 0);
    if (stopperReached)
    {
        // keep what is left of the text
        _remainderOffset = index;
        _remainder = _text.Substring(_remainderOffset);
    }

    CloseCurrentNode();
    return !stopperReached; // false stops parsing
}
/// <summary>
/// Records where the name of the current node starts and whether the node is a start tag.
/// </summary>
/// <param name="starttag">true for a start tag, false for a closing tag.</param>
/// <param name="index">The index of the first character of the name.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    HtmlNode node = _currentnode;
    node._namestartindex = index;
    node._starttag = starttag;
}
/// <summary>
/// Gets the names of the "resetter" elements for the given element name, or null when
/// the element has none.
/// </summary>
/// <param name="name">The element name.</param>
/// <returns>"ul" for li, "table" for tr, "tr" and "table" for th and td, null otherwise.</returns>
private string[] GetResetters(string name)
{
    if (name == "li")
    {
        return new string[] { "ul" };
    }
    if (name == "tr")
    {
        return new string[] { "table" };
    }
    if ((name == "th") || (name == "td"))
    {
        return new string[] { "tr", "table" };
    }
    return null;
}
/// <summary>
/// Runs the nested tag fix for the current node, using its lowercased name.
/// Closing tags are ignored.
/// </summary>
private void FixNestedTags()
{
    // we are only interested by start tags, not closing tags
    if (_currentnode._starttag)
    {
        string lowered = CurrentNodeName().ToLower();
        string[] resetters = GetResetters(lowered);
        FixNestedTag(lowered, resetters);
    }
}
-
/// <summary>
/// Closes the previous unclosed node with the same name, unless one of the resetter
/// nodes is found for it.
/// </summary>
/// <param name="name">The name of the node that is starting.</param>
/// <param name="resetters">The resetter names for that node; null means there is nothing to fix.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
        return;

    // if we find a previous unclosed same name node, without a resetter node between, we must close it
    HtmlNode previous = (HtmlNode)_lastnodes[name];
    if ((previous == null) || previous.Closed)
        return;

    // try to find a resetter node, if found, we do nothing
    if (FindResetterNodes(previous, resetters))
        return;

    // ok we need to close the previous node now, using a fake closer node
    HtmlNode closer = new HtmlNode(previous.NodeType, this, -1);
    closer._endnode = closer;
    previous.CloseNode(closer);
}
/// <summary>
/// Tells whether a resetter node exists for the given node under at least one of the given names.
/// </summary>
/// <param name="node">The node to check against.</param>
/// <param name="names">The resetter names to try; may be null.</param>
/// <returns>true as soon as one resetter node is found, false otherwise or when names is null.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names != null)
    {
        foreach (string candidate in names)
        {
            if (FindResetterNode(node, candidate) != null)
            {
                return true;
            }
        }
    }
    return false;
}
/// <summary>
/// Gets the last node recorded under the given name when it can act as a resetter for
/// the given node: it must still be open and must not start before that node.
/// </summary>
/// <param name="node">The node to check against.</param>
/// <param name="name">The resetter name to look up.</param>
/// <returns>The resetter node, or null when there is none.</returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode)_lastnodes[name];

    bool unusable =
        (candidate == null) ||
        candidate.Closed ||
        (candidate._streamposition < node._streamposition);

    return unusable ? null : candidate;
}
/// <summary>
/// Records the length of the name of the current node, then fixes nested tags when
/// OptionFixNestedTags is set.
/// </summary>
/// <param name="index">The index just past the last character of the name.</param>
private void PushNodeNameEnd(int index)
{
    int nameStart = _currentnode._namestartindex;
    _currentnode._namelength = index - nameStart;

    if (!OptionFixNestedTags)
    {
        return;
    }
    FixNestedTags();
}
/// <summary>
/// Closes the current node. The last open node with the same name, when there is one,
/// is closed by the current node. Otherwise the current node is treated as a
/// self-closed element, kept as plain text, or reported as a parse error. Unless an
/// error was recorded, the last parent node is then updated.
/// </summary>
private void CloseCurrentNode()
{
    if (_currentnode.Closed) // text or document are by def closed
        return;

    bool error = false;

    // find last node of this kind
    HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
    if (prev == null)
    {
        if (HtmlNode.IsClosedElement(_currentnode.Name))
        {
            // </br> will be seen as <br>
            _currentnode.CloseNode(_currentnode);

            // add to parent node
            if (_lastparentnode != null)
            {
                // Walk the children backwards looking for a childless node with the same
                // name; the nodes met on the way are remembered in document order.
                HtmlNode foundNode = null;
                Stack futureChild = new Stack();
                for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
                {
                    if ((node.Name == _currentnode.Name) && (!node.HasChildNodes))
                    {
                        foundNode = node;
                        break;
                    }
                    futureChild.Push(node);
                }
                if (foundNode != null)
                {
                    // move the nodes that followed the found node under it
                    HtmlNode node = null;
                    while (futureChild.Count != 0)
                    {
                        node = (HtmlNode)futureChild.Pop();
                        _lastparentnode.RemoveChild(node);
                        foundNode.AppendChild(node);
                    }
                }
                else
                {
                    _lastparentnode.AppendChild(_currentnode);
                }
            }
        }
        else
        {
            // no open node with this name was found, and the node is not a closed element
            if (HtmlNode.CanOverlapElement(_currentnode.Name))
            {
                // this is a hack: add it as a text node
                HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
                closenode._outerlength = _currentnode._outerlength;
                // Lowercase with the invariant culture: the text is markup, and the current
                // culture could alter it (for example 'I' becomes a dotless i under Turkish).
                ((HtmlTextNode)closenode).Text =
                    ((HtmlTextNode)closenode).Text.ToLower(System.Globalization.CultureInfo.InvariantCulture);
                if (_lastparentnode != null)
                {
                    _lastparentnode.AppendChild(closenode);
                }
            }
            else
            {
                if (HtmlNode.IsEmptyElement(_currentnode.Name))
                {
                    AddError(
                        HtmlParseErrorCode.EndTagNotRequired,
                        _currentnode._line, _currentnode._lineposition,
                        _currentnode._streamposition, _currentnode.OuterHtml,
                        "End tag </" + _currentnode.Name + "> is not required");
                }
                else
                {
                    // node cannot overlap, node is not empty
                    AddError(
                        HtmlParseErrorCode.TagNotOpened,
                        _currentnode._line, _currentnode._lineposition,
                        _currentnode._streamposition, _currentnode.OuterHtml,
                        "Start tag <" + _currentnode.Name + "> was not found");
                    error = true;
                }
            }
        }
    }
    else
    {
        if (OptionFixNestedTags)
        {
            if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
            {
                AddError(
                    HtmlParseErrorCode.EndTagInvalidHere,
                    _currentnode._line, _currentnode._lineposition,
                    _currentnode._streamposition, _currentnode.OuterHtml,
                    "End tag </" + _currentnode.Name + "> invalid here");
                error = true;
            }
        }

        if (!error)
        {
            // the node before prev becomes the last one of this kind again
            _lastnodes[_currentnode.Name] = prev._prevwithsamename;
            prev.CloseNode(_currentnode);
        }
    }

    // we close this node, get grandparent
    if (!error)
    {
        if ((_lastparentnode != null) &&
            ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
            (_currentnode._starttag)))
        {
            UpdateLastParentNode();
        }
    }
}
/// <summary>
/// Moves the last parent node up to its nearest ancestor that is not closed, falling
/// back to the document node when there is none. A last parent node that is not
/// closed is left unchanged.
/// </summary>
internal void UpdateLastParentNode()
{
    if (_lastparentnode.Closed)
    {
        _lastparentnode = _lastparentnode.ParentNode;
        while ((_lastparentnode != null) && (_lastparentnode.Closed))
        {
            _lastparentnode = _lastparentnode.ParentNode;
        }
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
-
/// <summary>
/// Extracts the name of the current attribute from the parsed text.
/// </summary>
/// <returns>The attribute name, exactly as written in the text.</returns>
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the value of the current attribute from the parsed text.
/// </summary>
/// <returns>The attribute value, exactly as written in the text.</returns>
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the name of the current node from the parsed text.
/// </summary>
/// <returns>The node name, exactly as written in the text.</returns>
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the outer text of the current node from the parsed text.
/// </summary>
/// <returns>The text covered by the outer start index and outer length of the node.</returns>
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Extracts the inner text of the current node from the parsed text.
/// </summary>
/// <returns>The text covered by the inner start index and inner length of the node.</returns>
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
/// <summary>
/// Tells whether the specified character code is one of the four whitespace characters
/// recognized by the parser: tab (9), line feed (10), carriage return (13) or space (32).
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered as a whitespace character.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:
        case 10:
        case 13:
        case 32:
            return true;
        default:
            return false;
    }
}
- }
/// <summary>
/// Exception that carries an encoding. The document throws it from
/// ReadDocumentEncoding when only encoding detection was requested.
/// </summary>
internal class EncodingFoundException: Exception
{
    // the encoding handed to the constructor
    private Encoding _found;

    /// <summary>
    /// Creates the exception for the given encoding.
    /// </summary>
    /// <param name="encoding">The encoding to carry.</param>
    internal EncodingFoundException(Encoding encoding)
    {
        _found = encoding;
    }

    /// <summary>
    /// Gets the encoding given at construction.
    /// </summary>
    internal Encoding Encoding
    {
        get { return _found; }
    }
}
- }
|