XmlInputSource.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. // -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
  2. //
  3. // System.Xml.XmlInputSource.cs
  4. // port of Open Xml TXmlInputSource class
  5. //
  6. // Author:
  7. // Daniel Weber ([email protected])
  8. //
  9. // (C) 2001 Daniel Weber
  10. //
  11. //
  12. using System;
  13. using System.IO;
  14. namespace System.Xml
  15. {
  16. internal class XmlInputSource
  17. {
  // --- input description --------------------------------------------------
  // Encoding chosen by setEncodingType(); getNextChar() and reset() switch on it.
  private DomEncodingType Fencoding;
  // Identifiers supplied to the constructor.
  private string FpublicID;
  private string FsystemID;
  // Root name determined by getRootName()/setRootName(); string.Empty when unknown.
  public string FrootName;
  // The stream all characters are read from.
  private Stream FStream;
  // Line-end normalization state kept by getNextChar().
  private bool FLastCharWasCR;

  // --- locator ------------------------------------------------------------
  // Current position, maintained by evaluate().
  private int FColumnNumber;
  // Position captured by pieceStart().
  private int FStartColumnNumber;
  private int FStartLineNumber;
  // True when the next evaluated character begins a new line.
  private bool FLastWCharWasLF;
  private int FLineNumber;
  // Set by pieceEnd(); makes evaluate() call pieceStart() on the next character.
  private bool FPieceEndSet;

  // --- UTF-8 decoding -----------------------------------------------------
  // Buffer storage for UTF-8 surrogates: the last decoded UCS-4 value, kept so a
  // code point at or above 0x10000 can be handed out as two UTF-16 code units.
  // See http://www.ietf.org/rfc/rfc2279.txt for a complete description of UTF-8 encoding.
  private int FLastUcs4;
  //FLocator: TdomStandardLocator;
  35. // public properties
  36. //===========================================================================
  /// <summary>Encoding determined for the input stream.</summary>
  public DomEncodingType encoding
  {
      get { return this.Fencoding; }
  }

  //property locator: TdomStandardLocator read FLocator;

  /// <summary>Public identifier passed to the constructor.</summary>
  public string publicId
  {
      get { return this.FpublicID; }
  }

  /// <summary>Root name found at the start of the stream, or an empty string.</summary>
  public string rootName
  {
      get { return this.FrootName; }
  }

  /// <summary>The underlying input stream.</summary>
  public Stream stream
  {
      get { return this.FStream; }
  }

  /// <summary>
  /// Placeholder: this port does not implement the conversion and always
  /// returns an empty string.
  /// </summary>
  public string streamAsWideString
  {
      //wideString read getStreamAsWideString;
      get { return string.Empty; }
  }

  /// <summary>System identifier passed to the constructor.</summary>
  public string systemId
  {
      get { return this.FsystemID; }
  }

  /// <summary>Column of the most recently evaluated character.</summary>
  public int columnNumber
  {
      get { return this.FColumnNumber; }
  }

  /// <summary>Line of the most recently evaluated character.</summary>
  public int lineNumber
  {
      get { return this.FLineNumber; }
  }

  /// <summary>Column recorded by the last call to pieceStart().</summary>
  public int startColumnNumber
  {
      get { return this.FStartColumnNumber; }
  }

  /// <summary>Line recorded by the last call to pieceStart().</summary>
  public int startLineNumber
  {
      get { return this.FStartLineNumber; }
  }
  97. // private methods
  98. //===========================================================================
  /// <summary>
  /// Inspects the first bytes of the stream to determine the document encoding
  /// and stores the result in <c>Fencoding</c>.
  /// A UTF-16 byte order mark selects UTF-16 (big- or little-endian). Otherwise
  /// UTF-8 is assumed and, when the stream starts with an XML declaration, the
  /// value of its <c>encoding</c> attribute is mapped to a supported encoding.
  /// Detection is best effort: any failure leaves the encoding at UTF-8.
  /// The stream is rewound to its origin before returning.
  /// </summary>
  private void setEncodingType()
  {
      // Default, also used while the XML declaration itself is being read.
      Fencoding = DomEncodingType.etUTF8;
      try
      {
          // Five bytes are needed to recognise "<?xml".
          byte[] buf = new byte[5];
          FStream.Seek(0, SeekOrigin.Begin);

          // Stream.Read may deliver fewer bytes than requested, so loop until
          // the buffer is full or the stream ends.
          int filled = 0;
          while (filled < buf.Length)
          {
              int got = FStream.Read(buf, filled, buf.Length - filled);
              if (got <= 0)
                  break;
              filled += got;
          }

          // UTF-16 streams should begin with a byte order mark:
          // 0xFE 0xFF for big-endian, 0xFF 0xFE for little-endian.
          if (filled >= 2 && buf[0] == 0xfe && buf[1] == 0xff)
          {
              Fencoding = DomEncodingType.etUTF16BE;
          }
          else if (filled >= 2 && buf[0] == 0xff && buf[1] == 0xfe)
          {
              Fencoding = DomEncodingType.etUTF16LE;
          }
          else if (filled == buf.Length &&
                   buf[0] == 0x3c && buf[1] == 0x3f &&        // "<?"
                   (buf[2] == 0x78 || buf[2] == 0x58) &&      // "x" or "X"
                   (buf[3] == 0x6d || buf[3] == 0x4d) &&      // "m" or "M"
                   (buf[4] == 0x6c || buf[4] == 0x4c))        // "l" or "L"
          {
              // Re-read the declaration from the start, up to and including
              // the first '>', e.g. <?xml version="1.0" encoding="ISO-8859-6"?>
              FStream.Seek(0, SeekOrigin.Begin);
              System.Text.StringBuilder declaration = new System.Text.StringBuilder();
              while (FStream.Position != FStream.Length)
              {
                  char c = getNextChar();
                  declaration.Append(c);
                  if (c == '>')
                      break;
              }

              // Encoding names are compared case-insensitively; the invariant
              // culture keeps the result independent of the user's locale.
              string name = readEncodingAttrFromTag(declaration.ToString()).ToUpperInvariant();
              Fencoding = encodingFromName(name);
          }
      }
      catch
      {
          // Deliberate best effort: a malformed declaration or a read error
          // must not prevent parsing, so fall back to UTF-8.
          Fencoding = DomEncodingType.etUTF8;
      }
      FStream.Seek(0, SeekOrigin.Begin);
  }

  /// <summary>
  /// Maps an upper-cased encoding name to a supported encoding.
  /// Names are matched exactly so that, for example, "ISO-8859-10" is not
  /// mistaken for "ISO-8859-1".
  /// </summary>
  /// <param name="upperName">Encoding name, already converted to upper case.</param>
  /// <returns>The matching encoding, or UTF-8 when the name is not recognised.</returns>
  private static DomEncodingType encodingFromName(string upperName)
  {
      switch (upperName)
      {
          case "ISO-8859-1":
          case "LATIN-1":
              return DomEncodingType.etLatin1;
          case "ISO-8859-2":
          case "LATIN-2":
              return DomEncodingType.etLatin2;
          case "ISO-8859-3":
          case "LATIN-3":
              return DomEncodingType.etLatin3;
          case "ISO-8859-4":
          case "LATIN-4":
              return DomEncodingType.etLatin4;
          case "ISO-8859-5":
          case "CYRILLIC":
              return DomEncodingType.etCyrillic;
          case "ISO-8859-6":
          case "ARABIC":
              return DomEncodingType.etArabic;
          case "ISO-8859-7":
          case "GREEK":
              return DomEncodingType.etGreek;
          case "ISO-8859-8":
          case "HEBREW":
              return DomEncodingType.etHebrew;
          case "ISO-8859-9":
          case "LATIN-5":
              return DomEncodingType.etLatin5;
          case "ISO-8859-10":
          case "LATIN-6":
              return DomEncodingType.etLatin6;
          case "ISO-8859-13":
          case "LATIN-7":
              return DomEncodingType.etLatin7;
          case "ISO-8859-14":
          case "LATIN-8":
              return DomEncodingType.etLatin8;
          case "ISO-8859-15":
          case "LATIN-9":
              return DomEncodingType.etLatin9;
          case "KOI8-R":
              return DomEncodingType.etKOI8R;
          case "CP10000_MACROMAN":
              return DomEncodingType.etcp10000_MacRoman;
          case "WINDOWS-1250":
          case "CP1250":
              return DomEncodingType.etcp1250;
          case "WINDOWS-1251":
          case "CP1251":
              return DomEncodingType.etcp1251;
          case "WINDOWS-1252":
          case "CP1252":
              return DomEncodingType.etcp1252;
          default:
              return DomEncodingType.etUTF8;
      }
  }
  /// <summary>
  /// Helper that extracts the value of the <c>encoding</c> attribute from a
  /// declaration tag. Does not do well-formedness checks on the rest of the tag.
  /// </summary>
  /// <param name="tag">Text of the tag to scan.</param>
  /// <exception cref="InvalidOperationException">
  /// If a quote appears before any equals sign, the closing quote differs from
  /// the opening quote, or the value contains a character other than
  /// letters, digits, '_', '-' or '.'.
  /// </exception>
  /// <returns>
  /// The attribute value, or string.Empty if the attribute is missing or its
  /// value is never closed by a quote.
  /// </returns>
  private string readEncodingAttrFromTag( string tag )
  {
      const string attributeName = "encoding";

      if (tag == null)
          return string.Empty;

      // Ordinal search: the attribute name is a fixed ASCII token.
      int nameAt = tag.IndexOf(attributeName, StringComparison.Ordinal);
      if (nameAt == -1)
          return string.Empty;

      bool equalsFound = false;
      bool insideValue = false;
      char openingQuote = '\0';
      System.Text.StringBuilder value = new System.Text.StringBuilder();

      for (int i = nameAt + attributeName.Length; i < tag.Length; i++)
      {
          char c = tag[i];

          if (c == '"' || c == '\'')
          {
              if (insideValue)
              {
                  if (c == openingQuote)
                      return value.ToString();
                  throw new InvalidOperationException("non-matching quotes in attribute value");
              }
              if (!equalsFound)
                  throw new InvalidOperationException("No equals sign found in encoding attribute");
              insideValue = true;
              openingQuote = c;
          }
          else if (insideValue)
          {
              // An '=' is only a separator before the opening quote; inside the
              // value it is rejected like any other illegal character.
              bool allowed =
                  (c >= 'a' && c <= 'z') ||
                  (c >= 'A' && c <= 'Z') ||
                  (c >= '0' && c <= '9') ||
                  c == '_' || c == '-' || c == '.';
              if (!allowed)
                  throw new InvalidOperationException("invalid character in encoding attribute");
              value.Append(c);
          }
          else if (c == '=')
          {
              equalsFound = true;
          }
          // Any other character before the opening quote (e.g. whitespace) is skipped.
      }

      // The value was never closed.
      return string.Empty;
  }
  /// <summary>
  /// Updates the locator (line and column counters) for one character that has
  /// just been read, and starts a new piece if pieceEnd() was requested.
  /// </summary>
  /// <param name="c">The character that was read.</param>
  private void evaluate(char c)
  {
      if (!FLastWCharWasLF)
      {
          // Still on the same line.
          FColumnNumber += 1;
      }
      else
      {
          // The previous character ended a line: this one opens the next.
          FLineNumber += 1;
          FColumnNumber = 1;
          FLastWCharWasLF = false;
      }

      if (c == '\n')
      {
          FLastWCharWasLF = true;
      }

      if (FPieceEndSet)
      {
          pieceStart();
      }
  }
  /// <summary>
  /// Marks the current piece as finished; the next evaluated character
  /// starts a new piece.
  /// </summary>
  public void pieceEnd()
  {
      this.FPieceEndSet = true;
  }

  /// <summary>
  /// Records the current line and column as the start of a piece and clears
  /// the pending piece-end flag.
  /// </summary>
  public void pieceStart()
  {
      this.FPieceEndSet = false;
      this.FStartLineNumber = this.FLineNumber;
      this.FStartColumnNumber = this.FColumnNumber;
  }
  /// <summary>
  /// Tells whether the read position has reached the end of the input stream.
  /// </summary>
  /// <returns>true when the stream position equals the stream length.</returns>
  public bool atEOF()
  {
      long total = FStream.Length;
      long current = FStream.Position;
      return total == current;
  }
  /// <summary>
  /// Sets the internal root name by analyzing the tags at the beginning of the stream.
  /// The root name is:
  ///  - the name listed in a &lt;!DOCTYPE ...&gt; declaration, if one comes first;
  ///  - otherwise the name of the first element tag found;
  ///  - empty if a parse error occurs or no applicable tag is found.
  /// Comments and processing instructions before that tag are skipped.
  /// Does not do well-formedness checks.
  /// </summary>
  private void getRootName()
  {
      reset();
      FrootName = string.Empty;

      while ( !atEOF() )
      {
          // Skip whitespace up to the next tag.
          char c = getNextChar();
          while ( !atEOF() && XmlNames_1_0.IsXmlWhiteSpace( c ) )
              c = getNextChar();
          if ( c != '<' )
              break;

          // Collect the tag text. Completion is judged by the last character
          // read rather than by atEOF(), so a tag that ends exactly at the end
          // of the stream is still accepted.
          System.Text.StringBuilder tag = new System.Text.StringBuilder("<");
          bool closed = false;
          while ( !closed && !atEOF() )
          {
              c = getNextChar();
              tag.Append(c);
              closed = (c == '>') && tagIsClosed(tag);
          }
          if ( !closed )
              break;

          string text = tag.ToString();

          // Only comments and processing instructions may precede <!DOCTYPE ...>
          // or the root element; skip them and look at the next tag.
          if ( text.StartsWith("<?", StringComparison.Ordinal) ||
               text.StartsWith("<!--", StringComparison.Ordinal) )
              continue;

          if ( text.StartsWith("<!DOCTYPE", StringComparison.Ordinal) )
          {
              // What we're looking for.
              setRootName( text );
              break;
          }

          // Any other declaration or an end tag: the root name cannot be
          // determined, so leave it empty.
          if ( text.StartsWith("<!", StringComparison.Ordinal) ||
               text.StartsWith("</", StringComparison.Ordinal) )
              break;

          // No DOCTYPE: the first element tag is the root. Stop here so that
          // later elements cannot overwrite it.
          setRootName( text );
          break;
      }
  }

  /// <summary>
  /// Decides whether a '>' just appended to <paramref name="tag"/> really ends it.
  /// Comments end only at "-->" and processing instructions only at "?>",
  /// because both may contain '>' in their content.
  /// </summary>
  /// <param name="tag">Text collected so far; its last character is '>'.</param>
  /// <returns>true if the tag is complete.</returns>
  private static bool tagIsClosed(System.Text.StringBuilder tag)
  {
      int n = tag.Length;

      bool isComment = n >= 4 && tag[1] == '!' && tag[2] == '-' && tag[3] == '-';
      if (isComment)
          return n >= 7 && tag[n - 2] == '-' && tag[n - 3] == '-';

      bool isProcessingInstruction = n >= 2 && tag[1] == '?';
      if (isProcessingInstruction)
          return n >= 4 && tag[n - 2] == '?';

      return true;
  }
  /// <summary>
  /// Extracts a root name from a tag and stores it in <c>FrootName</c>.
  /// For a &lt;!DOCTYPE ...&gt; declaration the name following the keyword is
  /// used; for any other tag the name directly after the opening '&lt;' is used.
  /// <c>FrootName</c> is left unchanged when the extracted text is not
  /// accepted by <c>XmlNames_1_0.isXmlName</c>.
  /// </summary>
  /// <param name="doctypeTag">Text of a DOCTYPE declaration or element tag, starting with '&lt;'.</param>
  private void setRootName( string doctypeTag )
  {
      const string doctypeKeyword = "<!DOCTYPE";

      if ( doctypeTag == null )
          return;

      int length = doctypeTag.Length;

      // Start after the DOCTYPE keyword, or after the '<' of an element tag.
      int start = doctypeTag.IndexOf(doctypeKeyword, StringComparison.Ordinal);
      if ( start == -1 )
          start = 1;
      else
          start += doctypeKeyword.Length;

      // Skip whitespace before the name. Bounds are checked first so the
      // index is never evaluated past the end of the string.
      while ( start < length && XmlNames_1_0.IsXmlWhiteSpace( doctypeTag[start] ) )
          start++;

      // The name runs up to whitespace or one of '>', '[', '/'.
      int end = start;
      while ( end < length &&
              !XmlNames_1_0.IsXmlWhiteSpace( doctypeTag[end] ) &&
              doctypeTag[end] != '>' &&
              doctypeTag[end] != '[' &&
              doctypeTag[end] != '/' )
          end++;

      string candidate = (start < length)
          ? doctypeTag.Substring(start, end - start)
          : string.Empty;

      if ( XmlNames_1_0.isXmlName(candidate) )
          FrootName = candidate;
  }
  /// <summary>
  /// Reads the next character from the stream, decoding it according to the
  /// detected encoding, and updates the locator.
  /// Line ends are normalized: a CR LF pair or a lone CR is returned as a
  /// single LF.
  /// </summary>
  /// <returns>
  /// The next UTF-16 code unit. For the single-byte and UTF-16 encodings,
  /// (char) 0xffff is returned when no complete character could be read.
  /// </returns>
  public char getNextChar()
  {
      const char CR = '\r';
      const char LF = '\n';

      byte[] buf = new byte[2];
      char retval = (char) 0xffff;
      int bCount;

      switch (Fencoding)
      {
          case DomEncodingType.etLatin1:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_1ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin2:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_2ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin3:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_3ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin4:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_4ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etCyrillic:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_5ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etArabic:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_6ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etGreek:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_7ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etHebrew:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_8ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin5:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_9ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin6:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_10ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin7:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_13ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin8:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_14ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etLatin9:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.Iso8859_15ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etKOI8R:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.KOI8_RToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etcp10000_MacRoman:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.cp10000_MacRomanToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etcp1250:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.cp1250ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etcp1251:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.cp1251ToUTF16Char(buf[0]);
              break;
          case DomEncodingType.etcp1252:
              if (stream.Read(buf, 0, 1) == 1)
                  retval = XmlUtil.cp1252ToUTF16Char(buf[0]);
              break;

          case DomEncodingType.etUTF8:
              if ( FLastUcs4 >= 0x10000 )
              {
                  // The high surrogate of this code point was returned by the
                  // previous call; output the low surrogate now.
                  retval = XmlUtil.Utf16LowSurrogate(FLastUcs4);
                  FLastUcs4 = 0;
              }
              else
              {
                  FLastUcs4 = XmlUtil.ReadUTF8Char( stream );
                  if ( FLastUcs4 >= 0x10000 )
                      retval = XmlUtil.Utf16HighSurrogate(FLastUcs4);
                  else
                      retval = (char) FLastUcs4;
              }
              break;

          case DomEncodingType.etUTF16BE:
              bCount = stream.Read(buf, 0, 2);
              if (bCount == 1)
                  bCount += stream.Read(buf, 1, 1);   // Read may return a partial unit
              if (bCount == 2)
                  // Big-endian: the first byte is the high-order byte of the unit.
                  retval = (char) ((buf[0] << 8) | buf[1]);
              break;

          case DomEncodingType.etUTF16LE:
              bCount = stream.Read(buf, 0, 2);
              if (bCount == 1)
                  bCount += stream.Read(buf, 1, 1);   // Read may return a partial unit
              if (bCount == 2)
                  // Little-endian: the second byte is the high-order byte of the unit.
                  retval = (char) ((buf[1] << 8) | buf[0]);
              break;
      }

      // Normalize CRLF or a single CR to LF.
      if ( retval == LF && FLastCharWasCR )
      {
          // Second half of a CR LF pair: LF was already reported for the CR,
          // so drop this character and deliver the one after it.
          FLastCharWasCR = false;
          return getNextChar();
      }

      if ( retval == CR )
      {
          FLastCharWasCR = true;
          retval = LF;
      }
      else
      {
          FLastCharWasCR = false;
      }

      // The locator sees the normalized character, so every line end
      // advances the line counter exactly once.
      evaluate(retval);
      return retval;
  }
  /// <summary>
  /// Rewinds the input to its first character and clears the decoder and
  /// locator state.
  /// </summary>
  public void reset()
  {
      FLastCharWasCR = false;
      FLastUcs4 = 0;

      // UTF-16 streams start with a two-byte 0xfeff/0xfffe mark that must be skipped.
      bool isUtf16 = Fencoding == DomEncodingType.etUTF16BE
                  || Fencoding == DomEncodingType.etUTF16LE;
      if (isUtf16)
      {
          FStream.Seek(2, SeekOrigin.Begin);
      }
      else
      {
          FStream.Seek(0, SeekOrigin.Begin);
      }

      // Locator back to "before the first character".
      FLineNumber = 0;
      FColumnNumber = 0;
      FStartLineNumber = 0;
      FStartColumnNumber = 0;
      FLastWCharWasLF = true;
      pieceEnd();
  }
  530. /*
  531. * private
  532. protected
  533. function getStreamAsWideString: wideString; virtual;
  534. procedure skipTextDecl(const locator: TdomStandardLocator); virtual;
  535. public
  536. constructor create(const stream: TStream;
  537. const publicId,
  538. systemId: wideString); virtual;
  539. destructor destroy; override;
  540. */
  541. // Constructor
  542. //===========================================================================
  /// <summary>
  /// Creates an input source over <paramref name="inputStream"/>, then
  /// determines the stream's encoding and root name.
  /// The stream has to support seeking: <c>Seek</c> is called on it while the
  /// encoding and root name are determined.
  /// </summary>
  /// <param name="inputStream">Stream containing the XML document.</param>
  /// <param name="publicID">Public identifier of the document.</param>
  /// <param name="systemID">System identifier of the document.</param>
  /// <exception cref="ArgumentNullException">If <paramref name="inputStream"/> is null.</exception>
  public XmlInputSource(Stream inputStream, string publicID, string systemID)
  {
      // Without an access modifier the constructor was private, which left the
      // class with no way of being instantiated.
      if (inputStream == null)
          throw new ArgumentNullException("inputStream", "Null stream passed to XmlInputSource constructor");

      FStream = inputStream;
      FLastUcs4 = 0;
      FLastCharWasCR = false;
      FpublicID = publicID;
      FsystemID = systemID;

      setEncodingType();
      //FLocator:= TdomStandardLocator.create(self);
      getRootName();
  }
  556. }
  557. }