HtmlDocument.cs 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079
  1. // HtmlAgilityPack V1.0 - Simon Mourier <[email protected]>
  2. using System;
  3. using System.IO;
  4. using System.Text;
  5. using System.Diagnostics;
  6. using System.Collections;
  7. using System.Text.RegularExpressions;
  8. using System.Xml;
  9. using System.Xml.XPath;
  10. namespace HtmlAgilityPack
  11. {
  /// <summary>
  /// Identifies the kind of problem recorded in an <see cref="HtmlParseError"/>.
  /// </summary>
  public enum HtmlParseErrorCode
  {
      /// <summary>
      /// A start tag has no matching end tag.
      /// </summary>
      TagNotClosed = 0,

      /// <summary>
      /// An end tag has no matching start tag.
      /// </summary>
      TagNotOpened = 1,

      /// <summary>
      /// The encoding of the stream differs from the encoding declared in the document (META).
      /// </summary>
      CharsetMismatch = 2,

      /// <summary>
      /// An end tag was found where none was required.
      /// </summary>
      EndTagNotRequired = 3,

      /// <summary>
      /// An end tag was found at a position where it is not valid.
      /// </summary>
      EndTagInvalidHere = 4
  }
  /// <summary>
  /// Immutable description of one error encountered while parsing a document.
  /// Instances are created inside the assembly only (the constructor is internal).
  /// </summary>
  public class HtmlParseError
  {
      private readonly HtmlParseErrorCode _code;
      private readonly int _line;
      private readonly int _linePosition;
      private readonly int _streamPosition;
      private readonly string _sourceText;
      private readonly string _reason;

      // Stores the supplied values verbatim; no validation is performed.
      internal HtmlParseError(
          HtmlParseErrorCode code,
          int line,
          int linePosition,
          int streamPosition,
          string sourceText,
          string reason)
      {
          this._code = code;
          this._line = line;
          this._linePosition = linePosition;
          this._streamPosition = streamPosition;
          this._sourceText = sourceText;
          this._reason = reason;
      }

      /// <summary>
      /// Gets the category of this error.
      /// </summary>
      public HtmlParseErrorCode Code
      {
          get { return this._code; }
      }

      /// <summary>
      /// Gets the line number at which this error occurred in the document.
      /// </summary>
      public int Line
      {
          get { return this._line; }
      }

      /// <summary>
      /// Gets the column number at which this error occurred in the document.
      /// </summary>
      public int LinePosition
      {
          get { return this._linePosition; }
      }

      /// <summary>
      /// Gets the absolute position of this error, counted from the start of the document.
      /// </summary>
      public int StreamPosition
      {
          get { return this._streamPosition; }
      }

      /// <summary>
      /// Gets the full text of the line containing the error.
      /// </summary>
      public string SourceText
      {
          get { return this._sourceText; }
      }

      /// <summary>
      /// Gets a human-readable description of the error.
      /// </summary>
      public string Reason
      {
          get { return this._reason; }
      }
  }
  125. /// <summary>
  126. /// Represents a complete HTML document.
  127. /// </summary>
  128. public class HtmlDocument: IXPathNavigable
  129. {
  // Messages for exceptions thrown by code outside this excerpt.
  internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
  internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";

  // Node bookkeeping. _openednodes is (re)created by Load/DetectEncoding only when
  // OptionCheckSyntax is set, and _nodesid only when OptionUseIdAttribute is set;
  // otherwise each is null.
  internal Hashtable _openednodes;
  internal Hashtable _lastnodes = new Hashtable();
  internal Hashtable _nodesid;

  // Root node of the document; replaced on every Load/DetectEncoding call.
  private HtmlNode _documentnode;

  // Complete text read from the reader passed to Load/DetectEncoding.
  internal string _text;

  // Backing stores for the Remainder and RemainderOffset properties.
  private string _remainder;
  private int _remainderOffset;

  // Parser working state (maintained by parsing code outside this excerpt).
  private HtmlNode _currentnode;
  private HtmlNode _lastparentnode;
  private HtmlAttribute _currentattribute;
  private int _index;
  private int _line;
  private int _lineposition;
  private int _maxlineposition;
  private int _c;
  private bool _fullcomment;

  // Encoding of the underlying StreamReader (null for other readers) and the
  // encoding declared by the document itself.
  private Encoding _streamencoding;
  private Encoding _declaredencoding;

  // Errors collected while parsing; exposed through ParseErrors.
  private ArrayList _parseerrors = new ArrayList();

  private ParseState _state;
  private ParseState _oldstate;
  private Crc32 _crc32;

  // Set to true by DetectEncoding and to false by Load before parsing starts.
  private bool _onlyDetectEncoding;
  // ---- public options ----

  /// <summary>
  /// When true, a checksum is computed for the document during parsing. Defaults to false.
  /// </summary>
  public bool OptionComputeChecksum = false;

  /// <summary>
  /// When true, the declared encoding is read from the document, i.e. from the
  /// meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
  /// Defaults to true.
  /// </summary>
  public bool OptionReadEncoding = true;

  /// <summary>
  /// When true, nodes left open are checked (and reported as parse errors) once parsing ends.
  /// Defaults to true.
  /// </summary>
  public bool OptionCheckSyntax = true;

  /// <summary>
  /// When true, the 'id' attribute is specifically used. Defaults to true.
  /// </summary>
  public bool OptionUseIdAttribute = true;

  /// <summary>
  /// When true, empty nodes are written as closed on output. Defaults to false.
  /// </summary>
  public bool OptionWriteEmptyNodes = false;

  /// <summary>
  /// When true, output conforms to XML instead of HTML. Defaults to false.
  /// </summary>
  public bool OptionOutputAsXml = false;

  /// <summary>
  /// When true, names are written in uppercase. Defaults to false.
  /// </summary>
  public bool OptionOutputUpperCase = false;

  /// <summary>
  /// When true, attribute values are written in optimized form
  /// (without surrounding double quotes where possible). Defaults to false.
  /// </summary>
  public bool OptionOutputOptimizeAttributeValues = false;

  /// <summary>
  /// When true, debugging attributes are added to nodes. Defaults to false.
  /// </summary>
  public bool OptionAddDebuggingAttributes = false;

  /// <summary>
  /// When true, the source text is extracted for each parse error.
  /// Enabling this can dramatically slow parsing when a document contains many
  /// (or cascading) errors. Defaults to false.
  /// </summary>
  public bool OptionExtractErrorSourceText = false;

  /// <summary>
  /// When true, non closed nodes are closed at the end of the document rather than
  /// directly where they occur. Enabling this can change how browsers render the page.
  /// Defaults to false.
  /// </summary>
  public bool OptionAutoCloseOnEnd = false;

  /// <summary>
  /// When true, LI, TR, TH and TD tags are partially fixed when nesting errors are detected.
  /// Defaults to false.
  /// </summary>
  public bool OptionFixNestedTags = false;

  /// <summary>
  /// Maximum number of characters of source text kept for a parse error. Defaults to 100.
  /// </summary>
  public int OptionExtractErrorSourceTextMaxLength = 100;

  /// <summary>
  /// Encoding used for streams and files when none is specified.
  /// Defaults to System.Text.Encoding.Default.
  /// </summary>
  public Encoding OptionDefaultStreamEncoding = System.Text.Encoding.Default;

  /// <summary>
  /// Name of a node that throws the StopperNodeException when found as an end node.
  /// Defaults to null.
  /// </summary>
  public string OptionStopperNodeName = null;
  /// <summary>
  /// Gets the text that remains after parsing stopped.
  /// This is always null when OptionStopperNodeName is null.
  /// </summary>
  public string Remainder
  {
      get { return _remainder; }
  }
  /// <summary>
  /// Gets the position of <see cref="Remainder"/> within the original Html text.
  /// When OptionStopperNodeName is null this is the length of the original Html text.
  /// </summary>
  public int RemainderOffset
  {
      get { return _remainderOffset; }
  }
  /// <summary>
  /// Gets the list of parse errors collected for this document.
  /// The internal list itself is returned, not a copy.
  /// </summary>
  public ArrayList ParseErrors
  {
      get { return _parseerrors; }
  }
  /// <summary>
  /// Gets the encoding of the stream the document was read from.
  /// This is null when the document was not read through a StreamReader.
  /// </summary>
  public System.Text.Encoding StreamEncoding
  {
      get { return _streamencoding; }
  }
  /// <summary>
  /// Gets the encoding declared by the document itself, i.e. by its
  /// meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
  /// </summary>
  public System.Text.Encoding DeclaredEncoding
  {
      get { return _declaredencoding; }
  }
  /// <summary>
  /// Initializes an empty HTML document that contains only its root document node.
  /// </summary>
  public HtmlDocument()
  {
      HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
      _documentnode = root;
  }
  // Returns the first direct child of the document node whose name is exactly "?xml",
  // or null when the document node has no children or no such child exists.
  internal HtmlNode GetXmlDeclaration()
  {
      if (_documentnode.HasChildNodes)
      {
          foreach (HtmlNode child in _documentnode._childnodes)
          {
              // exact comparison is fine here: names are case sensitive
              if (child.Name == "?xml")
              {
                  return child;
              }
          }
      }
      return null;
  }
  /// <summary>
  /// Applies HTML encoding to a specified string.
  /// The characters &amp;, &lt;, &gt; and the double quote are replaced by their entities.
  /// An ampersand that already starts one of the entities amp, lt, gt or quot
  /// (matched case-insensitively) is left untouched, so encoding twice does not double-escape them.
  /// </summary>
  /// <param name="html">The input string to encode. May not be null.</param>
  /// <returns>The encoded string.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
  public static string HtmlEncode(string html)
  {
      if (html == null)
      {
          throw new ArgumentNullException("html");
      }
      // Escape & first, and only once: the negative lookahead skips ampersands that
      // already begin a recognized entity. The static Regex.Replace overload is used so
      // the pattern is served from the framework's regex cache instead of being
      // re-parsed into a new Regex instance on every call.
      string encoded = Regex.Replace(
          html,
          "&(?!(amp;)|(lt;)|(gt;)|(quot;))",
          "&amp;",
          RegexOptions.IgnoreCase);
      return encoded.Replace("<", "&lt;").Replace(">", "&gt;").Replace("\"", "&quot;");
  }
  /// <summary>
  /// Detects the encoding of an HTML document supplied as a stream.
  /// The stream is wrapped in a StreamReader created without an explicit encoding
  /// (OptionDefaultStreamEncoding is not used here) and is not closed by this method.
  /// </summary>
  /// <param name="stream">The input stream. May not be null.</param>
  /// <returns>The detected encoding.</returns>
  public Encoding DetectEncoding(Stream stream)
  {
      if (stream != null)
      {
          StreamReader reader = new StreamReader(stream);
          return DetectEncoding(reader);
      }
      throw new ArgumentNullException("stream");
  }
  /// <summary>
  /// Detects the encoding of an HTML file.
  /// The file is opened with OptionDefaultStreamEncoding and is always closed before
  /// returning, including when detection fails with an exception.
  /// </summary>
  /// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
  /// <returns>The detected encoding.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
  public Encoding DetectEncoding(string path)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }
      // using guarantees the file handle is released even if detection throws
      using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
      {
          return DetectEncoding(sr);
      }
  }
  /// <summary>
  /// Detects the encoding of an HTML text.
  /// </summary>
  /// <param name="html">The input html text. May not be null.</param>
  /// <returns>The detected encoding.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
  public Encoding DetectEncodingHtml(string html)
  {
      if (html == null)
      {
          throw new ArgumentNullException("html");
      }
      // using disposes the reader on every path, including when detection throws
      using (StringReader sr = new StringReader(html))
      {
          return DetectEncoding(sr);
      }
  }
  /// <summary>
  /// Detects the encoding of an HTML text read from a TextReader.
  /// The reader is consumed to its end and the document's current text and root node
  /// are replaced as a side effect.
  /// </summary>
  /// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
  /// <returns>
  /// The encoding carried by the EncodingFoundException raised during parsing,
  /// or null when parsing finishes without that exception.
  /// </returns>
  public Encoding DetectEncoding(TextReader reader)
  {
      if (reader == null)
      {
          throw new ArgumentNullException("reader");
      }

      _onlyDetectEncoding = true;

      // the bookkeeping tables exist only when the matching option is enabled
      _openednodes = OptionCheckSyntax ? new Hashtable() : null;
      _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

      // only a StreamReader can report the encoding of the underlying stream
      StreamReader streamReader = reader as StreamReader;
      _streamencoding = (streamReader != null) ? streamReader.CurrentEncoding : null;
      _declaredencoding = null;

      _text = reader.ReadToEnd();
      _documentnode = CreateNode(HtmlNodeType.Document, 0);

      // this is almost a hack: the regular parser runs unchanged and the
      // exception is what carries the detected encoding back to us
      Encoding detected = null;
      try
      {
          Parse();
      }
      catch (EncodingFoundException found)
      {
          detected = found.Encoding;
      }
      return detected;
  }
  /// <summary>
  /// Loads an HTML document from a stream, decoding it with OptionDefaultStreamEncoding.
  /// The stream is not closed by this method.
  /// </summary>
  /// <param name="stream">The input stream.</param>
  public void Load(Stream stream)
  {
      StreamReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
      Load(reader);
  }
  /// <summary>
  /// Loads an HTML document from a stream, optionally detecting the encoding from byte order marks.
  /// OptionDefaultStreamEncoding is not used by this overload. The stream is not closed by this method.
  /// </summary>
  /// <param name="stream">The input stream.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
  public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
  {
      StreamReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
      Load(reader);
  }
  /// <summary>
  /// Loads an HTML document from a stream using the given character encoding.
  /// The stream is not closed by this method.
  /// </summary>
  /// <param name="stream">The input stream.</param>
  /// <param name="encoding">The character encoding to use.</param>
  public void Load(Stream stream, Encoding encoding)
  {
      StreamReader reader = new StreamReader(stream, encoding);
      Load(reader);
  }
  /// <summary>
  /// Loads an HTML document from a stream using the given character encoding,
  /// optionally detecting the encoding from byte order marks.
  /// The stream is not closed by this method.
  /// </summary>
  /// <param name="stream">The input stream.</param>
  /// <param name="encoding">The character encoding to use.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
  public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
  {
      StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
      Load(reader);
  }
  /// <summary>
  /// Loads an HTML document from a stream using the given character encoding and buffer size,
  /// optionally detecting the encoding from byte order marks.
  /// The stream is not closed by this method.
  /// </summary>
  /// <param name="stream">The input stream.</param>
  /// <param name="encoding">The character encoding to use.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
  /// <param name="buffersize">The minimum buffer size.</param>
  public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
  {
      StreamReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
      Load(reader);
  }
  /// <summary>
  /// Loads an HTML document from a file, decoding it with OptionDefaultStreamEncoding.
  /// The file is always closed before returning, including when loading fails.
  /// </summary>
  /// <param name="path">The complete file path to be read. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
  public void Load(string path)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }
      // using releases the file handle even if Load throws
      using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
      {
          Load(sr);
      }
  }
  /// <summary>
  /// Loads an HTML document from a file, optionally detecting the encoding from byte order marks.
  /// The file is always closed before returning, including when loading fails.
  /// </summary>
  /// <param name="path">The complete file path to be read. May not be null.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
  public void Load(string path, bool detectEncodingFromByteOrderMarks)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }
      // using releases the file handle even if Load throws
      using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
      {
          Load(sr);
      }
  }
  /// <summary>
  /// Loads an HTML document from a file using the given character encoding.
  /// The file is always closed before returning, including when loading fails.
  /// </summary>
  /// <param name="path">The complete file path to be read. May not be null.</param>
  /// <param name="encoding">The character encoding to use. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
  public void Load(string path, Encoding encoding)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }
      if (encoding == null)
      {
          throw new ArgumentNullException("encoding");
      }
      // using releases the file handle even if Load throws
      using (StreamReader sr = new StreamReader(path, encoding))
      {
          Load(sr);
      }
  }
  /// <summary>
  /// Loads an HTML document from a file using the given character encoding,
  /// optionally detecting the encoding from byte order marks.
  /// The file is always closed before returning, including when loading fails.
  /// </summary>
  /// <param name="path">The complete file path to be read. May not be null.</param>
  /// <param name="encoding">The character encoding to use. May not be null.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
  public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }
      if (encoding == null)
      {
          throw new ArgumentNullException("encoding");
      }
      // using releases the file handle even if Load throws
      using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
      {
          Load(sr);
      }
  }
  /// <summary>
  /// Loads an HTML document from a file using the given character encoding and buffer size,
  /// optionally detecting the encoding from byte order marks.
  /// The file is always closed before returning, including when loading fails.
  /// </summary>
  /// <param name="path">The complete file path to be read. May not be null.</param>
  /// <param name="encoding">The character encoding to use. May not be null.</param>
  /// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
  /// <param name="buffersize">The minimum buffer size.</param>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
  public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }
      if (encoding == null)
      {
          throw new ArgumentNullException("encoding");
      }
      // using releases the file handle even if Load throws
      using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
      {
          Load(sr);
      }
  }
  /// <summary>
  /// Loads the HTML document from the specified string.
  /// </summary>
  /// <param name="html">String containing the HTML document to load. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
  public void LoadHtml(string html)
  {
      if (html == null)
      {
          throw new ArgumentNullException("html");
      }
      // using disposes the reader on every path, including when Load throws
      using (StringReader sr = new StringReader(html))
      {
          Load(sr);
      }
  }
  /// <summary>
  /// Detects the encoding of an HTML file and then loads that file.
  /// Equivalent to calling the two-argument overload with detectEncoding set to true.
  /// </summary>
  /// <param name="path">The complete file path to be read.</param>
  public void DetectEncodingAndLoad(string path)
  {
      const bool detectEncoding = true;
      DetectEncodingAndLoad(path, detectEncoding);
  }
  /// <summary>
  /// Optionally detects the encoding of an HTML file, then loads that file.
  /// When detection is disabled or yields no encoding, the file is loaded through Load(path);
  /// otherwise it is loaded with the detected encoding.
  /// </summary>
  /// <param name="path">The complete file path to be read. May not be null.</param>
  /// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
  public void DetectEncodingAndLoad(string path, bool detectEncoding)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }

      System.Text.Encoding detected = detectEncoding ? DetectEncoding(path) : null;

      if (detected != null)
      {
          Load(path, detected);
      }
      else
      {
          Load(path);
      }
  }
  /// <summary>
  /// Loads the HTML document from the specified TextReader.
  /// The reader is consumed to its end, the text is parsed and, when OptionCheckSyntax is set,
  /// a TagNotClosed error is recorded for every node still open after parsing.
  /// </summary>
  /// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
  public void Load(TextReader reader)
  {
      // every other Load overload ends up here
      if (reader == null)
      {
          throw new ArgumentNullException("reader");
      }

      _onlyDetectEncoding = false;

      // the bookkeeping tables exist only when the matching option is enabled
      _openednodes = OptionCheckSyntax ? new Hashtable() : null;
      _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

      StreamReader streamReader = reader as StreamReader;
      if (streamReader == null)
      {
          _streamencoding = null;
      }
      else
      {
          try
          {
              // peeking forces the byte order mark (if any) to be read first
              streamReader.Peek();
          }
          catch
          {
              // ignored on purpose
          }
          _streamencoding = streamReader.CurrentEncoding;
      }
      _declaredencoding = null;

      _text = reader.ReadToEnd();
      _documentnode = CreateNode(HtmlNodeType.Document, 0);
      Parse();

      if (!OptionCheckSyntax)
      {
          return;
      }

      foreach (HtmlNode unclosed in _openednodes.Values)
      {
          // nodes whose _starttag flag is cleared have already been reported
          if (unclosed._starttag)
          {
              string excerpt = string.Empty;
              if (OptionExtractErrorSourceText)
              {
                  excerpt = unclosed.OuterHtml;
                  if (excerpt.Length > OptionExtractErrorSourceTextMaxLength)
                  {
                      excerpt = excerpt.Substring(0, OptionExtractErrorSourceTextMaxLength);
                  }
              }
              AddError(
                  HtmlParseErrorCode.TagNotClosed,
                  unclosed._line, unclosed._lineposition,
                  unclosed._streamposition, excerpt,
                  "End tag </" + unclosed.Name + "> was not found");
          }
      }

      // the table is of no further use once the errors are recorded
      _openednodes.Clear();
  }
  // Picks the encoding used for output: the declared encoding when there is one,
  // otherwise the stream encoding, otherwise OptionDefaultStreamEncoding.
  internal System.Text.Encoding GetOutEncoding()
  {
      if (_declaredencoding != null)
      {
          return _declaredencoding;
      }
      if (_streamencoding != null)
      {
          return _streamencoding;
      }
      return OptionDefaultStreamEncoding;
  }
  /// <summary>
  /// Gets the encoding used when the document is written out.
  /// </summary>
  public System.Text.Encoding Encoding
  {
      get { return GetOutEncoding(); }
  }
  /// <summary>
  /// Saves the HTML document to the specified stream using the document's output encoding.
  /// The stream is flushed to but left open; closing it remains the caller's responsibility.
  /// </summary>
  /// <param name="outStream">The stream to which you want to save. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="outStream"/> is null.</exception>
  public void Save(Stream outStream)
  {
      if (outStream == null)
      {
          throw new ArgumentNullException("outStream");
      }
      StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
      Save(sw);
      // StreamWriter buffers its output; without an explicit flush the tail of the
      // document (or all of it) may never reach outStream. The writer is not
      // disposed because that would also close the caller's stream.
      sw.Flush();
  }
  /// <summary>
  /// Saves the HTML document to the specified stream using the given encoding.
  /// The stream is flushed to but left open; closing it remains the caller's responsibility.
  /// </summary>
  /// <param name="outStream">The stream to which you want to save. May not be null.</param>
  /// <param name="encoding">The character encoding to use. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="outStream"/> or <paramref name="encoding"/> is null.</exception>
  public void Save(Stream outStream, System.Text.Encoding encoding)
  {
      if (outStream == null)
      {
          throw new ArgumentNullException("outStream");
      }
      if (encoding == null)
      {
          throw new ArgumentNullException("encoding");
      }
      StreamWriter sw = new StreamWriter(outStream, encoding);
      Save(sw);
      // StreamWriter buffers its output; without an explicit flush the tail of the
      // document (or all of it) may never reach outStream. The writer is not
      // disposed because that would also close the caller's stream.
      sw.Flush();
  }
  /// <summary>
  /// Saves the mixed document to the specified file using the document's output encoding.
  /// An existing file is overwritten. The file is always closed before returning,
  /// including when saving fails.
  /// </summary>
  /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
  public void Save(string filename)
  {
      if (filename == null)
      {
          throw new ArgumentNullException("filename");
      }
      // using flushes and closes the file even if Save throws, so the file is not left locked
      using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
      {
          Save(sw);
      }
  }
  /// <summary>
  /// Saves the mixed document to the specified file using the given encoding.
  /// An existing file is overwritten. The file is always closed before returning,
  /// including when saving fails.
  /// </summary>
  /// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
  /// <param name="encoding">The character encoding to use. May not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="encoding"/> is null.</exception>
  public void Save(string filename, System.Text.Encoding encoding)
  {
      if (filename == null)
      {
          throw new ArgumentNullException("filename");
      }
      if (encoding == null)
      {
          throw new ArgumentNullException("encoding");
      }
      // using flushes and closes the file even if Save throws, so the file is not left locked
      using (StreamWriter sw = new StreamWriter(filename, false, encoding))
      {
          Save(sw);
      }
  }
  /// <summary>
  /// Saves the HTML document to the specified StreamWriter by delegating to the TextWriter overload.
  /// </summary>
  /// <param name="writer">The StreamWriter to which you want to save.</param>
  public void Save(StreamWriter writer)
  {
    // Typing the local as TextWriter selects the TextWriter overload.
    TextWriter target = writer;
    Save(target);
  }
  /// <summary>
  /// Saves the HTML document to the specified TextWriter and flushes the writer.
  /// The writer is not closed.
  /// </summary>
  /// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
  /// <exception cref="ArgumentNullException">writer is null.</exception>
  public void Save(TextWriter writer)
  {
    if (writer == null)
    {
      throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    // Push buffered output through, as the XmlWriter overload does, so callers that
    // wrap a stream do not lose the tail of the document.
    writer.Flush();
  }
  /// <summary>
  /// Saves the HTML document to the specified XmlWriter and flushes the writer.
  /// </summary>
  /// <param name="writer">The XmlWriter to which you want to save. May not be null.</param>
  /// <exception cref="ArgumentNullException">writer is null.</exception>
  public void Save(XmlWriter writer)
  {
    // Fail fast with a descriptive exception, like the other Save overloads.
    if (writer == null)
    {
      throw new ArgumentNullException("writer");
    }
    DocumentNode.WriteTo(writer);
    writer.Flush();
  }
  /// <summary>
  /// Creates a new XPathNavigator object for navigating this HTML document.
  /// </summary>
  /// <returns>A navigator created over this document and its root node.</returns>
  public XPathNavigator CreateNavigator()
  {
    XPathNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
  }
  /// <summary>
  /// Gets a valid XML name.
  /// Characters in a-z, 0-9, '_', '-' and '.' are kept as they are. Every other character
  /// is replaced by the lowercase hex of its UTF-8 bytes followed by '_'. When at least one
  /// character had to be replaced, the whole result is prefixed with '_'.
  /// </summary>
  /// <param name="name">Any text. May not be null.</param>
  /// <returns>A string that is a valid XML name.</returns>
  /// <exception cref="ArgumentNullException">name is null.</exception>
  public static string GetXmlName(string name)
  {
    if (name == null)
    {
      throw new ArgumentNullException("name");
    }
    // A builder avoids re-copying the whole result for every appended character.
    System.Text.StringBuilder xmlname = new System.Text.StringBuilder(name.Length);
    bool nameisok = true;
    char[] single = new char[1];
    foreach (char ch in name)
    {
      // names are lowercase; ':' and uppercase letters are deliberately not accepted
      bool allowed =
        ((ch >= 'a') && (ch <= 'z')) ||
        ((ch >= '0') && (ch <= '9')) ||
        (ch == '_') || (ch == '-') || (ch == '.');
      if (allowed)
      {
        xmlname.Append(ch);
        continue;
      }
      nameisok = false;
      // Each char is encoded on its own, exactly as before, so existing names stay stable.
      single[0] = ch;
      byte[] bytes = System.Text.Encoding.UTF8.GetBytes(single);
      foreach (byte b in bytes)
      {
        xmlname.Append(b.ToString("x2"));
      }
      xmlname.Append('_');
    }
    if (nameisok)
    {
      return xmlname.ToString();
    }
    return "_" + xmlname.ToString();
  }
  /// <summary>
  /// Registers or unregisters a node in the id lookup table under the lower-cased id.
  /// Does nothing when OptionUseIdAttribute is off, the table does not exist, or id is null.
  /// </summary>
  /// <param name="node">The node to store under the id, or null to remove the entry.</param>
  /// <param name="id">The id value; it is lower-cased before use.</param>
  internal void SetIdForNode(HtmlNode node, string id)
  {
    if (!OptionUseIdAttribute || (_nodesid == null) || (id == null))
    {
      return;
    }
    string key = id.ToLower();
    if (node != null)
    {
      _nodesid[key] = node;
      return;
    }
    // a null node means "forget this id"
    _nodesid.Remove(key);
  }
  /// <summary>
  /// Gets the HTML node with the specified 'id' attribute value.
  /// The id is lower-cased before the lookup.
  /// </summary>
  /// <param name="id">The attribute id to match. May not be null.</param>
  /// <returns>The HTML node with the matching id or null if not found.</returns>
  public HtmlNode GetElementbyId(string id)
  {
    if (id == null)
    {
      throw new ArgumentNullException("id");
    }
    if (_nodesid == null)
    {
      // no id table exists, so ids cannot be resolved
      throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }
    string key = id.ToLower();
    object match = _nodesid[key];
    return match as HtmlNode;
  }
  /// <summary>
  /// Creates an HTML element node with the specified name.
  /// </summary>
  /// <param name="name">The qualified name of the element. May not be null.</param>
  /// <returns>The new HTML node.</returns>
  public HtmlNode CreateElement(string name)
  {
    if (name == null)
    {
      throw new ArgumentNullException("name");
    }
    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element._name = name;
    return element;
  }
  /// <summary>
  /// Creates an HTML comment node.
  /// </summary>
  /// <returns>The new HTML comment node.</returns>
  public HtmlCommentNode CreateComment()
  {
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode)created;
  }
  /// <summary>
  /// Creates an HTML comment node with the specified comment text.
  /// </summary>
  /// <param name="comment">The comment text. May not be null.</param>
  /// <returns>The new HTML comment node.</returns>
  public HtmlCommentNode CreateComment(string comment)
  {
    if (comment == null)
    {
      throw new ArgumentNullException("comment");
    }
    HtmlCommentNode commentNode = CreateComment();
    commentNode.Comment = comment;
    return commentNode;
  }
  /// <summary>
  /// Creates an HTML text node.
  /// </summary>
  /// <returns>The new HTML text node.</returns>
  public HtmlTextNode CreateTextNode()
  {
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode)created;
  }
  /// <summary>
  /// Creates an HTML text node with the specified text.
  /// </summary>
  /// <param name="text">The text of the node. May not be null.</param>
  /// <returns>The new HTML text node.</returns>
  public HtmlTextNode CreateTextNode(string text)
  {
    if (text == null)
    {
      throw new ArgumentNullException("text");
    }
    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
  }
  /// <summary>
  /// Creates a node of the given type, passing -1 as the index to the two-argument overload.
  /// </summary>
  /// <param name="type">The kind of node to create.</param>
  /// <returns>The new node.</returns>
  internal HtmlNode CreateNode(HtmlNodeType type)
  {
    int index = -1;
    return CreateNode(type, index);
  }
  /// <summary>
  /// Creates a node of the given type: HtmlCommentNode for comments, HtmlTextNode for text,
  /// and a plain HtmlNode for every other type.
  /// </summary>
  /// <param name="type">The kind of node to create.</param>
  /// <param name="index">The index handed to the node constructor.</param>
  /// <returns>The new node, owned by this document.</returns>
  internal HtmlNode CreateNode(HtmlNodeType type, int index)
  {
    if (type == HtmlNodeType.Comment)
    {
      return new HtmlCommentNode(this, index);
    }
    if (type == HtmlNodeType.Text)
    {
      return new HtmlTextNode(this, index);
    }
    return new HtmlNode(type, this, index);
  }
  /// <summary>
  /// Creates an attribute owned by this document, with no name or value set.
  /// </summary>
  /// <returns>The new HTML attribute.</returns>
  internal HtmlAttribute CreateAttribute()
  {
    HtmlAttribute created = new HtmlAttribute(this);
    return created;
  }
  /// <summary>
  /// Creates an HTML attribute with the specified name.
  /// </summary>
  /// <param name="name">The name of the attribute. May not be null.</param>
  /// <returns>The new HTML attribute.</returns>
  public HtmlAttribute CreateAttribute(string name)
  {
    if (name == null)
    {
      throw new ArgumentNullException("name");
    }
    HtmlAttribute created = CreateAttribute();
    created.Name = name;
    return created;
  }
  /// <summary>
  /// Creates an HTML attribute with the specified name and value.
  /// </summary>
  /// <param name="name">The name of the attribute. May not be null.</param>
  /// <param name="value">The value of the attribute.</param>
  /// <returns>The new HTML attribute.</returns>
  public HtmlAttribute CreateAttribute(string name, string value)
  {
    if (name == null)
    {
      throw new ArgumentNullException("name");
    }
    HtmlAttribute created = CreateAttribute(name);
    created.Value = value;
    return created;
  }
  /// <summary>
  /// Gets the root node of the document.
  /// </summary>
  public HtmlNode DocumentNode
  {
    get
    {
      HtmlNode root = _documentnode;
      return root;
    }
  }
  /// <summary>
  /// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
  /// </summary>
  public int CheckSum
  {
    get
    {
      // no CRC accumulator exists unless checksum computation was requested
      return (_crc32 == null) ? 0 : (int)_crc32.CheckSum;
    }
  }
  /// <summary>
  /// Builds a parse error from the given details, appends it to the document's
  /// parse error list and returns it.
  /// </summary>
  private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
  {
    HtmlParseError error =
      new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);
    _parseerrors.Add(error);
    return error;
  }
  /// <summary>
  /// States of the character-driven parser implemented in Parse().
  /// The numeric values are spelled out and match the implicit ones.
  /// </summary>
  private enum ParseState
  {
    // plain text outside of any tag
    Text = 0,
    // just after '<': deciding between a start tag and an end tag
    WhichTag = 1,
    // reading the tag name
    Tag = 2,
    // inside a tag, between two attributes
    BetweenAttributes = 3,
    // after '/' or '?' inside a tag, waiting for '>'
    EmptyTag = 4,
    // reading an attribute name
    AttributeName = 5,
    // after an attribute name, before a possible '='
    AttributeBeforeEquals = 6,
    // after '=', before the attribute value
    AttributeAfterEquals = 7,
    // reading an unquoted attribute value
    AttributeValue = 8,
    // inside a '<!' construct
    Comment = 9,
    // reading a quoted attribute value
    QuotedAttributeValue = 10,
    // inside a '<% ... %>' block
    ServerSideCode = 11,
    // raw content of a CDATA-style element, scanning for its end tag
    PcData = 12
  }
  /// <summary>
  /// Consumes the current character (_c): feeds it to the checksum when one is being
  /// computed, advances _index and updates the line / column counters.
  /// </summary>
  private void IncrementPosition()
  {
    if (_crc32 != null)
    {
      // REVIEW: should we add some checksum code in DecrementPosition too?
      _crc32.AddToCRC32(_c);
    }
    _index += 1;
    // remember the column we are leaving so DecrementPosition can step back over a line break
    _maxlineposition = _lineposition;
    bool atLineFeed = (_c == 10);
    if (!atLineFeed)
    {
      _lineposition += 1;
      return;
    }
    // line feed: next character starts a new line at column 1
    _line += 1;
    _lineposition = 1;
  }
  /// <summary>
  /// Steps the parser back by one character, undoing the index and line / column
  /// changes made by IncrementPosition (the checksum is not touched).
  /// </summary>
  private void DecrementPosition()
  {
    _index -= 1;
    if (_lineposition != 1)
    {
      _lineposition -= 1;
      return;
    }
    // at column 1: go back to the previous line, at the column saved by IncrementPosition
    _line -= 1;
    _lineposition = _maxlineposition;
  }
  /// <summary>
  /// Parses _text from its first character with a character-driven state machine
  /// (see ParseState) and builds the node tree under _documentnode.
  /// All parser state fields are reset first. Parse errors are collected in _parseerrors.
  /// Parsing ends at the end of the text, or earlier when PushNodeEnd returns false
  /// (which it does once the stopper node has been closed).
  /// </summary>
  private void Parse()
  {
    // quote character (' or ") that opened the quoted attribute value being read
    int lastquote = 0;
    if (OptionComputeChecksum)
    {
      _crc32 = new Crc32();
    }
    // reset the whole parser state
    _lastnodes = new Hashtable();
    _c = 0;
    _fullcomment = false;
    _parseerrors = new ArrayList();
    _line = 1;
    _lineposition = 1;
    _maxlineposition = 1;
    _state = ParseState.Text;
    _oldstate = _state;
    // the document node spans the whole text
    _documentnode._innerlength = _text.Length;
    _documentnode._outerlength = _text.Length;
    _remainderOffset = _text.Length;
    _lastparentnode = _documentnode;
    _currentnode = CreateNode(HtmlNodeType.Text, 0);
    _currentattribute = null;
    _index = 0;
    // parsing starts inside a text node at offset 0
    PushNodeStart(HtmlNodeType.Text, 0);
    while (_index<_text.Length)
    {
      // _c is the character being handled; after IncrementPosition, _index is the
      // offset of the NEXT character, so _index-1 is the offset of _c
      _c = _text[_index];
      IncrementPosition();
      switch(_state)
      {
        case ParseState.Text:
          // NewCheck handles '<' (new tag, comment or server-side code) and returns true when it did
          if (NewCheck())
            continue;
          break;
        case ParseState.WhichTag:
          if (NewCheck())
            continue;
          if (_c == '/')
          {
            // end tag: the name starts after the '/'
            PushNodeNameStart(false, _index);
          }
          else
          {
            // start tag: the name starts at the current character, which is
            // pushed back so the Tag state sees it again
            PushNodeNameStart(true, _index-1);
            DecrementPosition();
          }
          _state = ParseState.Tag;
          break;
        case ParseState.Tag:
          if (NewCheck())
            continue;
          if (IsWhiteSpace(_c))
          {
            PushNodeNameEnd(_index-1);
            // a callee may have changed the state; in that case leave it as it is
            if (_state != ParseState.Tag)
              continue;
            _state = ParseState.BetweenAttributes;
            continue;
          }
          if (_c == '/')
          {
            PushNodeNameEnd(_index-1);
            if (_state != ParseState.Tag)
              continue;
            _state = ParseState.EmptyTag;
            continue;
          }
          if (_c == '>')
          {
            PushNodeNameEnd(_index-1);
            if (_state != ParseState.Tag)
              continue;
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            // PushNodeEnd may have switched the state (e.g. to PcData)
            if (_state != ParseState.Tag)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
          }
          break;
        case ParseState.BetweenAttributes:
          if (NewCheck())
            continue;
          if (IsWhiteSpace(_c))
            continue;
          if ((_c == '/') || (_c == '?'))
          {
            _state = ParseState.EmptyTag;
            continue;
          }
          if (_c == '>')
          {
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            if (_state != ParseState.BetweenAttributes)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          // any other character starts an attribute name
          PushAttributeNameStart(_index-1);
          _state = ParseState.AttributeName;
          break;
        case ParseState.EmptyTag:
          if (NewCheck())
            continue;
          if (_c == '>')
          {
            // the tag is closed in place (close = true)
            if (!PushNodeEnd(_index, true))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            if (_state != ParseState.EmptyTag)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          // anything else: go back to looking for attributes
          _state = ParseState.BetweenAttributes;
          break;
        case ParseState.AttributeName:
          if (NewCheck())
            continue;
          if (IsWhiteSpace(_c))
          {
            PushAttributeNameEnd(_index-1);
            _state = ParseState.AttributeBeforeEquals;
            continue;
          }
          if (_c == '=')
          {
            PushAttributeNameEnd(_index-1);
            _state = ParseState.AttributeAfterEquals;
            continue;
          }
          if (_c == '>')
          {
            PushAttributeNameEnd(_index-1);
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            if (_state != ParseState.AttributeName)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          break;
        case ParseState.AttributeBeforeEquals:
          if (NewCheck())
            continue;
          if (IsWhiteSpace(_c))
            continue;
          if (_c == '>')
          {
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            if (_state != ParseState.AttributeBeforeEquals)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          if (_c == '=')
          {
            _state = ParseState.AttributeAfterEquals;
            continue;
          }
          // no equals, no whitespace, it's a new attribute starting:
          // push the character back so BetweenAttributes handles it
          _state = ParseState.BetweenAttributes;
          DecrementPosition();
          break;
        case ParseState.AttributeAfterEquals:
          if (NewCheck())
            continue;
          if (IsWhiteSpace(_c))
            continue;
          if ((_c == '\'') || (_c == '"'))
          {
            // quoted value: it starts after the quote; remember which quote closes it
            _state = ParseState.QuotedAttributeValue;
            PushAttributeValueStart(_index);
            lastquote = _c;
            continue;
          }
          if (_c == '>')
          {
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            if (_state != ParseState.AttributeAfterEquals)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          // unquoted value: it starts at the current character
          PushAttributeValueStart(_index-1);
          _state = ParseState.AttributeValue;
          break;
        case ParseState.AttributeValue:
          if (NewCheck())
            continue;
          if (IsWhiteSpace(_c))
          {
            PushAttributeValueEnd(_index-1);
            _state = ParseState.BetweenAttributes;
            continue;
          }
          if (_c == '>')
          {
            PushAttributeValueEnd(_index-1);
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            if (_state != ParseState.AttributeValue)
              continue;
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          break;
        case ParseState.QuotedAttributeValue:
          // NewCheck is not used here: '<' is ordinary content inside quotes, except for "<%"
          if (_c == lastquote)
          {
            PushAttributeValueEnd(_index-1);
            _state = ParseState.BetweenAttributes;
            continue;
          }
          if (_c == '<')
          {
            if (_index<_text.Length)
            {
              if (_text[_index] == '%')
              {
                // server-side code inside the value; come back to this state afterwards
                _oldstate = _state;
                _state = ParseState.ServerSideCode;
                continue;
              }
            }
          }
          break;
        case ParseState.Comment:
          if (_c == '>')
          {
            if (_fullcomment)
            {
              // a "<!--" comment only ends on "-->"
              if ((_text[_index-2] != '-') ||
                (_text[_index-3] != '-'))
              {
                continue;
              }
            }
            if (!PushNodeEnd(_index, false))
            {
              // stop parsing
              _index = _text.Length;
              break;
            }
            _state = ParseState.Text;
            PushNodeStart(HtmlNodeType.Text, _index);
            continue;
          }
          break;
        case ParseState.ServerSideCode:
          // skip everything up to the closing "%>"
          if (_c == '%')
          {
            if (_index<_text.Length)
            {
              if (_text[_index] == '>')
              {
                // resume from the state that was active when "<%" was met
                switch(_oldstate)
                {
                  case ParseState.AttributeAfterEquals:
                    _state = ParseState.AttributeValue;
                    break;
                  case ParseState.BetweenAttributes:
                    // the server-side block was recorded as an attribute name; end it after "%>"
                    PushAttributeNameEnd(_index+1);
                    _state = ParseState.BetweenAttributes;
                    break;
                  default:
                    _state = _oldstate;
                    break;
                }
                // consume the '>' as well
                IncrementPosition();
              }
            }
          }
          break;
        case ParseState.PcData:
          // look for </tag + 1 char
          // check buffer end
          if ((_currentnode._namelength+3)<=(_text.Length-(_index-1)))
          {
            // case-insensitive match of "</" + name at the current character
            if (string.Compare(_text.Substring(_index-1, _currentnode._namelength+2),
              "</" + _currentnode.Name, true) == 0)
            {
              // character right after "</name"
              int c = _text[_index-1 + 2 + _currentnode.Name.Length];
              if ((c == '>') || (IsWhiteSpace(c)))
              {
                // add the script as a text node
                HtmlNode script = CreateNode(HtmlNodeType.Text,
                  _currentnode._outerstartindex + _currentnode._outerlength);
                script._outerlength = _index-1 - script._outerstartindex;
                _currentnode.AppendChild(script);
                // then start the end tag node and let the Tag state read its name
                PushNodeStart(HtmlNodeType.Element, _index-1);
                PushNodeNameStart(false, _index-1 +2);
                _state = ParseState.Tag;
                IncrementPosition();
              }
            }
          }
          break;
      }
    }
    // finish the current work
    if (_currentnode._namestartindex > 0)
    {
      PushNodeNameEnd(_index);
    }
    PushNodeEnd(_index, false);
    // we don't need this anymore
    _lastnodes.Clear();
  }
  /// <summary>
  /// Handles a '&lt;' found in the current state. Depending on what follows it, this
  /// enters server-side code ("&lt;%"), starts a comment node ("&lt;!"), or starts an
  /// element node; the node being built is ended first (except for "&lt;%").
  /// </summary>
  /// <returns>
  /// false when the current character is not '&lt;' (nothing was done); true when the
  /// character was handled, including the case where parsing must stop
  /// (_index is then moved to the end of the text).
  /// </returns>
  private bool NewCheck()
  {
    if (_c != '<')
    {
      return false;
    }
    // _index is the offset of the character after '<'
    if (_index<_text.Length)
    {
      if (_text[_index] == '%')
      {
        // "<%": server-side code. Record where it starts, depending on the state.
        switch(_state)
        {
          case ParseState.AttributeAfterEquals:
            PushAttributeValueStart(_index-1);
            break;
          case ParseState.BetweenAttributes:
            PushAttributeNameStart(_index-1);
            break;
          case ParseState.WhichTag:
            PushNodeNameStart(true, _index-1);
            _state = ParseState.Tag;
            break;
        }
        // remember where to resume once "%>" is found
        _oldstate = _state;
        _state = ParseState.ServerSideCode;
        return true;
      }
    }
    // a new tag begins: end the node being built, just before the '<'
    if (!PushNodeEnd(_index-1, true))
    {
      // stop parsing
      _index = _text.Length;
      return true;
    }
    _state = ParseState.WhichTag;
    if ((_index-1) <= (_text.Length-2))
    {
      if (_text[_index] == '!')
      {
        PushNodeStart(HtmlNodeType.Comment, _index-1);
        PushNodeNameStart(true, _index);
        PushNodeNameEnd(_index+1);
        _state = ParseState.Comment;
        // A full comment starts with "<!--". The flag is always recomputed so it
        // cannot keep a stale value from an earlier comment when fewer than two
        // characters follow the '!'.
        _fullcomment = (_index<(_text.Length-2)) &&
          (_text[_index+1] == '-') &&
          (_text[_index+2] == '-');
        return true;
      }
    }
    PushNodeStart(HtmlNodeType.Element, _index-1);
    return true;
  }
  /// <summary>
  /// Reads the encoding declared by a meta tag of the form
  /// &lt;meta http-equiv="content-type" content="text/html;charset=iso-8859-1" /&gt;
  /// and stores it in _declaredencoding. Does nothing when OptionReadEncoding is off
  /// or the node is not such a meta tag. A charset name that the runtime does not
  /// recognize is ignored instead of aborting the parse.
  /// </summary>
  /// <param name="node">The element that was just completed; its attributes are already populated.</param>
  /// <exception cref="EncodingFoundException">
  /// Thrown with the declared encoding when only encoding detection was requested.
  /// </exception>
  private void ReadDocumentEncoding(HtmlNode node)
  {
    if (!OptionReadEncoding)
    {
      return;
    }
    // quick length check first, it avoids a string allocation for most nodes
    if (node._namelength != 4)
    {
      return;
    }
    if (node.Name != "meta") // all nodes names are lowercase
    {
      return;
    }
    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
    {
      return;
    }
    if (string.Compare(att.Value, "content-type", true) != 0)
    {
      return;
    }
    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
      return;
    }
    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
      return;
    }
    System.Text.Encoding declared;
    try
    {
      declared = System.Text.Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
      // unknown or malformed charset name in the page: ignore the declaration
      return;
    }
    catch (NotSupportedException)
    {
      // encoding not available on this platform: ignore the declaration
      return;
    }
    _declaredencoding = declared;
    if (_onlyDetectEncoding)
    {
      throw new EncodingFoundException(_declaredencoding);
    }
    if (_streamencoding == null)
    {
      return;
    }
    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
      AddError(
        HtmlParseErrorCode.CharsetMismatch,
        _line, _lineposition,
        _index, node.OuterHtml,
        "Encoding mismatch between StreamEncoding: " +
        _streamencoding.WebName + " and DeclaredEncoding: " + _declaredencoding.WebName);
    }
  }
  /// <summary>
  /// Starts a new current attribute whose name begins at the given offset, stamping
  /// it with the parser's current line and column.
  /// </summary>
  /// <param name="index">Offset in _text where the attribute name starts.</param>
  private void PushAttributeNameStart(int index)
  {
    HtmlAttribute started = CreateAttribute();
    started._namestartindex = index;
    started._streamposition = index;
    started._line = _line;
    started._lineposition = _lineposition;
    _currentattribute = started;
  }
  /// <summary>
  /// Ends the current attribute's name at the given offset and appends the attribute
  /// to the current node.
  /// </summary>
  /// <param name="index">Offset in _text just past the attribute name.</param>
  private void PushAttributeNameEnd(int index)
  {
    int nameStart = _currentattribute._namestartindex;
    _currentattribute._namelength = index - nameStart;
    _currentnode.Attributes.Append(_currentattribute);
  }
  /// <summary>
  /// Records where the current attribute's value starts.
  /// </summary>
  /// <param name="index">Offset in _text of the first character of the value.</param>
  private void PushAttributeValueStart(int index)
  {
    HtmlAttribute attribute = _currentattribute;
    attribute._valuestartindex = index;
  }
  /// <summary>
  /// Records the length of the current attribute's value, measured from its start offset.
  /// </summary>
  /// <param name="index">Offset in _text just past the value.</param>
  private void PushAttributeValueEnd(int index)
  {
    int valueStart = _currentattribute._valuestartindex;
    _currentattribute._valuelength = index - valueStart;
  }
  /// <summary>
  /// Makes a new node of the given type the current node and stamps it with the
  /// parser's current line, column and the given offset.
  /// </summary>
  /// <param name="type">The kind of node that starts here.</param>
  /// <param name="index">Offset in _text where the node starts.</param>
  private void PushNodeStart(HtmlNodeType type, int index)
  {
    HtmlNode started = CreateNode(type, index);
    _currentnode = started;
    started._line = _line;
    // elements are recorded one column to the left of the current column
    started._lineposition = (type == HtmlNodeType.Element) ? _lineposition - 1 : _lineposition;
    started._streamposition = index;
  }
  /// <summary>
  /// Ends the current node at the given offset. Text and comment nodes are appended
  /// to the last parent when they are not empty. Start tags are appended to the last
  /// parent, checked for an encoding declaration, remembered by name, and become the
  /// new last parent. The node is then closed when requested, when its element kind
  /// is always closed or empty, or when it is an end tag.
  /// </summary>
  /// <param name="index">Offset in _text just past the node.</param>
  /// <param name="close">true to close the node right away.</param>
  /// <returns>
  /// false when the node just closed is the stopper node (OptionStopperNodeName) and
  /// parsing must stop; true otherwise.
  /// </returns>
  private bool PushNodeEnd(int index, bool close)
  {
    _currentnode._outerlength = index - _currentnode._outerstartindex;
    if ((_currentnode._nodetype == HtmlNodeType.Text) ||
      (_currentnode._nodetype == HtmlNodeType.Comment))
    {
      // forget about void nodes
      if (_currentnode._outerlength>0)
      {
        // text and comments have no markup of their own: inner span == outer span
        _currentnode._innerlength = _currentnode._outerlength;
        _currentnode._innerstartindex = _currentnode._outerstartindex;
        if (_lastparentnode != null)
        {
          _lastparentnode.AppendChild(_currentnode);
        }
      }
    }
    else
    {
      if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
      {
        // add to parent node
        if (_lastparentnode != null)
        {
          _lastparentnode.AppendChild(_currentnode);
        }
        ReadDocumentEncoding(_currentnode);
        // remember last node of this kind
        HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
        _currentnode._prevwithsamename = prev;
        _lastnodes[_currentnode.Name] = _currentnode;
        // change parent?
        if ((_currentnode.NodeType == HtmlNodeType.Document) ||
          (_currentnode.NodeType == HtmlNodeType.Element))
        {
          _lastparentnode = _currentnode;
        }
        if (HtmlNode.IsCDataElement(CurrentNodeName()))
        {
          // content of such elements is raw: Parse scans it in the PcData state
          _state = ParseState.PcData;
          return true;
        }
        if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
          (HtmlNode.IsEmptyElement(_currentnode.Name)))
        {
          close = true;
        }
      }
    }
    if ((close) || (!_currentnode._starttag))
    {
      // stopper node reached for the first time: keep the rest of the text as the
      // remainder and tell the caller to stop
      if ((OptionStopperNodeName != null) && (_remainder == null) &&
        (string.Compare(_currentnode.Name, OptionStopperNodeName, true) == 0))
      {
        _remainderOffset = index;
        _remainder = _text.Substring(_remainderOffset);
        CloseCurrentNode();
        return false; // stop parsing
      }
      CloseCurrentNode();
    }
    return true;
  }
  /// <summary>
  /// Records where the current node's name starts and whether the node is a start tag.
  /// </summary>
  /// <param name="starttag">true for a start tag, false for an end tag.</param>
  /// <param name="index">Offset in _text of the first character of the name.</param>
  private void PushNodeNameStart(bool starttag, int index)
  {
    HtmlNode current = _currentnode;
    current._namestartindex = index;
    current._starttag = starttag;
  }
  /// <summary>
  /// Gets the names of the elements that act as "resetters" for the given element name
  /// when fixing nested tags: "ul" for "li", "table" for "tr", "tr" and "table" for
  /// "th" and "td".
  /// </summary>
  /// <param name="name">The element name, matched exactly.</param>
  /// <returns>A new array of resetter names, or null when the element has none.</returns>
  private string[] GetResetters(string name)
  {
    if (name == "li")
    {
      return new string[]{"ul"};
    }
    if (name == "tr")
    {
      return new string[]{"table"};
    }
    if ((name == "th") || (name == "td"))
    {
      return new string[]{"tr", "table"};
    }
    return null;
  }
  /// <summary>
  /// Applies the nested-tag fix to the current node, using its lower-cased name and
  /// the resetters defined for that name. End tags are ignored.
  /// </summary>
  private void FixNestedTags()
  {
    // we are only interested by start tags, not closing tags
    if (_currentnode._starttag)
    {
      string lowerName = CurrentNodeName().ToLower();
      string[] resetters = GetResetters(lowerName);
      FixNestedTag(lowerName, resetters);
    }
  }
  /// <summary>
  /// Closes the last unclosed node with the given name, unless a resetter node was
  /// opened after it. The node is closed with a synthetic closing node.
  /// </summary>
  /// <param name="name">The element name to look up among the last seen nodes.</param>
  /// <param name="resetters">Resetter element names; null means nothing is done.</param>
  private void FixNestedTag(string name, string[] resetters)
  {
    if (resetters == null)
    {
      return;
    }
    // if we find a previous unclosed same name node, without a resetter node between, we must close it
    HtmlNode previous = (HtmlNode)_lastnodes[name];
    if ((previous == null) || previous.Closed)
    {
      return;
    }
    // a resetter was found: the nesting is legitimate, leave it alone
    if (FindResetterNodes(previous, resetters))
    {
      return;
    }
    // ok we need to close the previous node now, with a fake closer node
    HtmlNode closer = new HtmlNode(previous.NodeType, this, -1);
    closer._endnode = closer;
    previous.CloseNode(closer);
  }
  /// <summary>
  /// Tells whether at least one of the named resetter nodes applies to the given node
  /// (see FindResetterNode).
  /// </summary>
  /// <param name="node">The node the resetters are checked against.</param>
  /// <param name="names">Resetter element names; may be null.</param>
  /// <returns>true when a resetter was found; false otherwise or when names is null.</returns>
  private bool FindResetterNodes(HtmlNode node, string[] names)
  {
    if (names != null)
    {
      foreach (string resetterName in names)
      {
        if (FindResetterNode(node, resetterName) != null)
        {
          return true;
        }
      }
    }
    return false;
  }
  /// <summary>
  /// Gets the last seen node with the given name when it is still open and does not
  /// start before the given node in the stream.
  /// </summary>
  /// <param name="node">The node whose stream position is the lower bound.</param>
  /// <param name="name">The resetter element name to look up.</param>
  /// <returns>The resetter node, or null when there is none that qualifies.</returns>
  private HtmlNode FindResetterNode(HtmlNode node, string name)
  {
    HtmlNode candidate = (HtmlNode)_lastnodes[name];
    bool qualifies =
      (candidate != null) &&
      (!candidate.Closed) &&
      (candidate._streamposition >= node._streamposition);
    return qualifies ? candidate : null;
  }
  /// <summary>
  /// Ends the current node's name at the given offset, then fixes nested tags when
  /// OptionFixNestedTags is set.
  /// </summary>
  /// <param name="index">Offset in _text just past the node name.</param>
  private void PushNodeNameEnd(int index)
  {
    int nameStart = _currentnode._namestartindex;
    _currentnode._namelength = index - nameStart;
    if (!OptionFixNestedTags)
    {
      return;
    }
    FixNestedTags();
  }
  /// <summary>
  /// Closes the current node. When an open node with the same name was seen before,
  /// that node is closed by the current one (unless the nested-tag check rejects the
  /// end tag). Otherwise the current node has no matching start tag, and it is handled
  /// according to its element kind: always-closed elements are closed on themselves,
  /// overlapping-capable elements are kept as text, and anything else is reported as
  /// a parse error. On success the last parent node is moved up as needed.
  /// </summary>
  private void CloseCurrentNode()
  {
    if (_currentnode.Closed) // text or document are by def closed
      return;
    bool error = false;
    // find last node of this kind
    HtmlNode prev = (HtmlNode)_lastnodes[_currentnode.Name];
    if (prev == null)
    {
      // no open node with this name: the current node has no start tag to close
      if (HtmlNode.IsClosedElement(_currentnode.Name))
      {
        // </br> will be seen as <br>
        _currentnode.CloseNode(_currentnode);
        // add to parent node
        if (_lastparentnode != null)
        {
          // walk the parent's children from the last one backwards, looking for a
          // childless node with the same name; the nodes passed on the way are stacked
          HtmlNode foundNode = null;
          Stack futureChild = new Stack();
          for (HtmlNode node = _lastparentnode.LastChild; node != null; node = node.PreviousSibling)
          {
            if ((node.Name == _currentnode.Name) && (! node.HasChildNodes))
            {
              foundNode = node;
              break;
            }
            futureChild.Push(node);
          }
          if (foundNode != null)
          {
            // move the stacked siblings under the found node, in document order
            HtmlNode node = null;
            while(futureChild.Count != 0)
            {
              node = (HtmlNode)futureChild.Pop();
              _lastparentnode.RemoveChild(node);
              foundNode.AppendChild(node);
            }
          }
          else
          {
            _lastparentnode.AppendChild(_currentnode);
          }
        }
      }
      else
      {
        // node has no parent
        // node is not a closed node
        if (HtmlNode.CanOverlapElement(_currentnode.Name))
        {
          // this is a hack: add it as a text node
          HtmlNode closenode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
          closenode._outerlength = _currentnode._outerlength;
          ((HtmlTextNode)closenode).Text = ((HtmlTextNode)closenode).Text.ToLower();
          if (_lastparentnode != null)
          {
            _lastparentnode.AppendChild(closenode);
          }
        }
        else
        {
          if (HtmlNode.IsEmptyElement(_currentnode.Name))
          {
            // reported, but not treated as an error for the parent update below
            AddError(
              HtmlParseErrorCode.EndTagNotRequired,
              _currentnode._line, _currentnode._lineposition,
              _currentnode._streamposition, _currentnode.OuterHtml,
              "End tag </" + _currentnode.Name + "> is not required");
          }
          else
          {
            // node cannot overlap, node is not empty
            AddError(
              HtmlParseErrorCode.TagNotOpened,
              _currentnode._line, _currentnode._lineposition,
              _currentnode._streamposition, _currentnode.OuterHtml,
              "Start tag <" + _currentnode.Name + "> was not found");
            error = true;
          }
        }
      }
    }
    else
    {
      if (OptionFixNestedTags)
      {
        // a resetter opened after the previous node makes this end tag invalid here
        if (FindResetterNodes(prev, GetResetters(_currentnode.Name)))
        {
          AddError(
            HtmlParseErrorCode.EndTagInvalidHere,
            _currentnode._line, _currentnode._lineposition,
            _currentnode._streamposition, _currentnode.OuterHtml,
            "End tag </" + _currentnode.Name + "> invalid here");
          error = true;
        }
      }
      if (!error)
      {
        // the previous node is closed: the one before it becomes the last of its name
        _lastnodes[_currentnode.Name] = prev._prevwithsamename;
        prev.CloseNode(_currentnode);
      }
    }
    // we close this node, get grandparent
    if (!error)
    {
      if ((_lastparentnode != null) &&
        ((!HtmlNode.IsClosedElement(_currentnode.Name)) ||
        (_currentnode._starttag)))
      {
        UpdateLastParentNode();
      }
    }
  }
  /// <summary>
  /// Moves _lastparentnode up the tree until it designates a node that is not closed,
  /// falling back to the document node when the top of the tree is passed.
  /// </summary>
  internal void UpdateLastParentNode()
  {
    if (_lastparentnode.Closed)
    {
      _lastparentnode = _lastparentnode.ParentNode;
      // keep climbing while the ancestor reached is closed too
      while ((_lastparentnode != null) && (_lastparentnode.Closed))
      {
        _lastparentnode = _lastparentnode.ParentNode;
      }
    }
    if (_lastparentnode == null)
    {
      _lastparentnode = _documentnode;
    }
  }
  /// <summary>
  /// Gets the name of the current attribute, as it appears in the source text.
  /// </summary>
  private string CurrentAttributeName()
  {
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
  }
  /// <summary>
  /// Gets the value of the current attribute, as it appears in the source text.
  /// </summary>
  private string CurrentAttributeValue()
  {
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
  }
  /// <summary>
  /// Gets the name of the current node, as it appears in the source text.
  /// </summary>
  private string CurrentNodeName()
  {
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
  }
  /// <summary>
  /// Gets the source text covered by the current node's outer span.
  /// </summary>
  private string CurrentNodeOuter()
  {
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
  }
  /// <summary>
  /// Gets the source text covered by the current node's inner span.
  /// </summary>
  private string CurrentNodeInner()
  {
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
  }
  /// <summary>
  /// Determines if the specified character is considered as a whitespace character:
  /// tab (9), line feed (10), carriage return (13) or space (32).
  /// </summary>
  /// <param name="c">The character to check.</param>
  /// <returns>true if the specified character is considered as a whitespace character.</returns>
  public static bool IsWhiteSpace(int c)
  {
    switch (c)
    {
      case 9:
      case 10:
      case 13:
      case 32:
        return true;
      default:
        return false;
    }
  }
  1858. }
  /// <summary>
  /// Exception that carries an encoding out of the parser; it is thrown when an
  /// encoding declaration is read while only encoding detection was requested.
  /// </summary>
  internal class EncodingFoundException: Exception
  {
    // the encoding handed to the constructor
    private Encoding _found;

    /// <summary>
    /// Creates the exception for the given encoding.
    /// </summary>
    /// <param name="encoding">The encoding to carry.</param>
    internal EncodingFoundException(Encoding encoding)
    {
      this._found = encoding;
    }

    /// <summary>
    /// Gets the encoding this exception was created with.
    /// </summary>
    internal Encoding Encoding
    {
      get
      {
        return this._found;
      }
    }
  }
  1874. }