// Tokenizer.cs (11 KB)
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. //
  10. // Permission is hereby granted, free of charge, to any person obtaining
  11. // a copy of this software and associated documentation files (the
  12. // "Software"), to deal in the Software without restriction, including
  13. // without limitation the rights to use, copy, modify, merge, publish,
  14. // distribute, sublicense, and/or sell copies of the Software, and to
  15. // permit persons to whom the Software is furnished to do so, subject to
  16. // the following conditions:
  17. //
  18. // The above copyright notice and this permission notice shall be
  19. // included in all copies or substantial portions of the Software.
  20. //
  21. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  22. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  24. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  25. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  26. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  27. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  28. //
  29. using System;
  30. using System.Globalization;
  31. using System.IO;
  32. using System.Text;
  33. using System.Collections;
  34. using Mono.Xml.XPath;
  35. using Mono.Xml.XPath.yyParser;
  36. namespace System.Xml.XPath
  37. {
  38. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  39. {
  // ---- scanner state ----

  // The expression text being tokenized.
  private string m_rgchInput;
  // Index of the next unread character in m_rgchInput.
  private int m_ich;
  // Number of characters in m_rgchInput.
  private int m_cch;
  // Token id produced by the most recent advance () call.
  private int m_iToken;
  // Token id of the previously produced token; Token.EOF until the
  // first token has been read (see IsFirstToken).
  private int m_iTokenPrev = Token.EOF;
  // Value attached to the current token (reset to null by advance ()).
  private object m_objToken;
  // True when the previous token was flagged as an operator.
  private bool m_fPrevWasOperator;
  // True when the token currently being parsed is flagged as an operator.
  private bool m_fThisIsOperator;

  // ---- keyword lookup ----

  // Maps a keyword string to its (boxed) token id; filled by the
  // static constructor from s_rgTokenMap.
  private static readonly Hashtable s_mapTokens = new Hashtable ();

  // Flat list of (token id, keyword) pairs: even slots hold the id,
  // odd slots hold the keyword text.
  private static readonly object [] s_rgTokenMap =
  {
      // operator names
      Token.AND,                    "and",
      Token.OR,                     "or",
      Token.DIV,                    "div",
      Token.MOD,                    "mod",
      // axis names
      Token.ANCESTOR,               "ancestor",
      Token.ANCESTOR_OR_SELF,       "ancestor-or-self",
      Token.ATTRIBUTE,              "attribute",
      Token.CHILD,                  "child",
      Token.DESCENDANT,             "descendant",
      Token.DESCENDANT_OR_SELF,     "descendant-or-self",
      Token.FOLLOWING,              "following",
      Token.FOLLOWING_SIBLING,      "following-sibling",
      Token.NAMESPACE,              "namespace",
      Token.PARENT,                 "parent",
      Token.PRECEDING,              "preceding",
      Token.PRECEDING_SIBLING,      "preceding-sibling",
      Token.SELF,                   "self",
      // node types
      Token.COMMENT,                "comment",
      Token.TEXT,                   "text",
      Token.PROCESSING_INSTRUCTION, "processing-instruction",
      Token.NODE,                   "node",
  };

  // Sentinel returned by Peek ()/GetChar () once the input is exhausted.
  private const char EOL = '\0';
  // Builds the keyword -> token id lookup table from the flat
  // (token id, keyword) pair list in s_rgTokenMap.
  static Tokenizer ()
  {
      int iPair = 0;
      while (iPair < s_rgTokenMap.Length)
      {
          object objTokenId = s_rgTokenMap [iPair];
          object objKeyword = s_rgTokenMap [iPair + 1];
          s_mapTokens.Add (objKeyword, objTokenId);
          iPair += 2;
      }
  }
  // Creates a tokenizer over strInput and positions it on the first
  // non-whitespace character.
  //
  // strInput: the XPath expression text; must not be null.
  // Throws ArgumentNullException when strInput is null (previously this
  // surfaced as a bare NullReferenceException from strInput.Length).
  public Tokenizer (string strInput)
  {
      if (strInput == null)
          throw new ArgumentNullException ("strInput");

      m_rgchInput = strInput;
      m_ich = 0;
      m_cch = strInput.Length;
      // leading whitespace is never part of a token
      SkipWhitespace ();
  }
  // Returns the character iOffset positions ahead of the read position
  // without consuming anything, or EOL when that position lies at or
  // beyond the end of the input.
  private char Peek (int iOffset)
  {
      int iPos = m_ich + iOffset;
      return (iPos < m_cch) ? m_rgchInput [iPos] : EOL;
  }
  // Returns the next unread character without consuming it, or EOL
  // when the input is exhausted.
  private char Peek ()
  {
      return (m_ich < m_cch) ? m_rgchInput [m_ich] : EOL;
  }
  // Consumes and returns the next character. At end of input nothing
  // is consumed and EOL is returned.
  private char GetChar ()
  {
      if (m_ich < m_cch)
      {
          char chNext = m_rgchInput [m_ich];
          m_ich++;
          return chNext;
      }
      return EOL;
  }
  // Moves the read position back by one character and returns the
  // character that is now next to be read.
  // Throws XPathException when already at the start of the input.
  private char PutBack ()
  {
      if (m_ich != 0)
      {
          m_ich -= 1;
          return m_rgchInput [m_ich];
      }
      throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
  }
  // Consumes any run of whitespace at the read position.
  // Returns true if at least one whitespace character was skipped.
  private bool SkipWhitespace ()
  {
      bool fSkippedAny = false;
      while (IsWhitespace (Peek ()))
      {
          GetChar ();
          fSkippedAny = true;
      }
      return fSkippedAny;
  }
  // Reads a Number (Digits ('.' Digits?)? | '.' Digits) at the read
  // position, stores its value as a boxed double in m_objToken and
  // returns Token.NUMBER.
  //
  // The caller guarantees that the text at the read position starts
  // with a digit, or with '.' followed by a digit.
  private int ParseNumber ()
  {
      StringBuilder sb = new StringBuilder ();

      // integer part (may be empty for the '.5' form)
      while (IsDigit (Peek ()))
          sb.Append (GetChar ());

      // don't handle '3.' as an error case (it is not. XPath 3.7 syntax [30])
      if (Peek () == '.')
      {
          sb.Append (GetChar ());
          while (IsDigit (Peek ()))
              sb.Append (GetChar ());
      }

      try
      {
          m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
      }
      catch (OverflowException)
      {
          // Older runtimes throw for a digit run too large for a double
          // instead of returning infinity. The text is unsigned, so the
          // IEEE result is positive infinity; don't leak a non-XPath
          // exception out of the tokenizer.
          m_objToken = Double.PositiveInfinity;
      }
      return Token.NUMBER;
  }
  // Reads a quoted string literal. The character at the read position
  // is taken as the delimiter; everything up to the next occurrence of
  // that delimiter becomes the literal's value (stored in m_objToken).
  // Returns Token.LITERAL.
  // Throws XPathException if the input ends before the closing delimiter.
  private int ParseLiteral ()
  {
      char chQuote = GetChar ();
      StringBuilder sbText = new StringBuilder ();

      for (;;)
      {
          char chNext = Peek ();
          if (chNext == chQuote)
              break;
          if (chNext == EOL)
              throw new XPathException ("unmatched "+chQuote+" in expression");
          sbText.Append (GetChar ());
      }

      // consume the closing delimiter
      GetChar ();
      m_objToken = sbText.ToString ();
      return Token.LITERAL;
  }
  // Reads a name at the read position: a letter or '_' followed by any
  // mix of letters, digits, '_', '-' and '.'. Whitespace following the
  // name is consumed as well.
  // Returns the name, or null (consuming nothing) when the next
  // character cannot start a name.
  private string ReadIdentifier ()
  {
      char chFirst = Peek ();
      if (chFirst != '_' && !Char.IsLetter (chFirst))
          return null;

      StringBuilder sbName = new StringBuilder ();
      sbName.Append (GetChar ());

      for (;;)
      {
          char chNext = Peek ();
          bool fNameChar = chNext == '_' || chNext == '-' || chNext == '.'
              || Char.IsLetterOrDigit (chNext);
          if (!fNameChar)
              break;
          sbName.Append (GetChar ());
      }

      SkipWhitespace ();
      return sbName.ToString ();
  }
  // Reads a name at the read position and classifies it as an axis
  // name, operator name, node type, function name or QName, based on
  // the characters that follow it and on the previous token.
  //
  // m_objToken is left holding the plain name string for keyword
  // tokens (axis / operator / node type) and an XmlQualifiedName for
  // QName and FUNCTION_NAME tokens.
  //
  // Throws XPathException when the name is not valid in its position.
  private int ParseIdentifier ()
  {
      string strName = ReadIdentifier ();
      Object objKeyword = s_mapTokens [strName];
      bool fIsKeyword = (objKeyword != null);
      int iToken = Token.QName;
      if (fIsKeyword)
          iToken = (int) objKeyword;
      m_objToken = strName;

      char chNext = Peek ();

      if (chNext == ':')
      {
          // a single ':' introduces the local part of a prefixed name
          if (Peek (1) != ':')
              return ParsePrefixedName (strName);

          // If the two characters following an NCName (possibly
          // after intervening ExprWhitespace) are ::, then the
          // token must be recognized as an AxisName.
          if (fIsKeyword && IsAxisName (iToken))
              return iToken;
          throw new XPathException ("invalid axis name: '"+strName+"'");
      }

      // If there is a preceding token and the preceding
      // token is not one of @, ::, (, [, , or an Operator,
      // then a * must be recognized as a MultiplyOperator
      // and an NCName must be recognized as an OperatorName.
      bool fOperatorExpected = !IsFirstToken && !m_fPrevWasOperator;
      if (fOperatorExpected)
      {
          if (fIsKeyword && IsOperatorName (iToken))
              return iToken;
          throw new XPathException ("invalid operator name: '"+strName+"'");
      }

      // If the character following an NCName (possibly
      // after intervening ExprWhitespace) is (, then the
      // token must be recognized as a NodeType or a FunctionName.
      if (chNext == '(' && fIsKeyword)
      {
          if (IsNodeType (iToken))
              return iToken;
          throw new XPathException ("invalid function name: '"+strName+"'");
      }

      // an unprefixed name: a function call when followed by '(',
      // otherwise a plain name test
      m_objToken = new XmlQualifiedName (strName, "");
      return (chNext == '(') ? Token.FUNCTION_NAME : Token.QName;
  }

  // Handles the part of a name after "prefix:" (the read position is on
  // the ':'). Produces a QName for "prefix:*" and "prefix:local", or a
  // FUNCTION_NAME when the local part is followed by '('.
  private int ParsePrefixedName (string strPrefix)
  {
      GetChar ();
      SkipWhitespace ();

      char chAfterColon = Peek ();
      if (chAfterColon == '*')
      {
          GetChar ();
          m_objToken = new XmlQualifiedName ("", strPrefix);
          return Token.QName;
      }

      string strLocal = ReadIdentifier ();
      if (strLocal == null)
          throw new XPathException ("invalid QName: "+strPrefix+":"+(char)chAfterColon);

      m_objToken = new XmlQualifiedName (strLocal, strPrefix);
      return (Peek () == '(') ? Token.FUNCTION_NAME : Token.QName;
  }
  // True for the four whitespace characters recognised by the
  // tokenizer: space, tab, line feed and carriage return.
  // (Deliberately narrower than Char.IsWhiteSpace.)
  private static bool IsWhitespace (char ch)
  {
      switch (ch)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
          return true;
      default:
          return false;
      }
  }
  // True only for the ASCII digits '0' through '9'.
  // (Deliberately narrower than Char.IsDigit.)
  private static bool IsDigit (char ch)
  {
      return !(ch < '0' || ch > '9');
  }
  // Reads one token at the read position and returns its token id.
  // Sets m_fThisIsOperator for tokens after which a name or '*' is NOT
  // to be read as an operator, and sets m_objToken for tokens that
  // carry a value (numbers, literals, names).
  //
  // Returns Token.EOF at end of input and Token.ERROR for a lone ':'.
  // Throws XPathException for any character that cannot start a token
  // (including '!' not followed by '=').
  int ParseToken ()
  {
      char chFirst = Peek ();

      if (chFirst == EOL)
          return Token.EOF;

      if (chFirst == '\'' || chFirst == '\"')
          return ParseLiteral ();

      if (IsDigit (chFirst))
          return ParseNumber ();

      if (chFirst == '_' || Char.IsLetter (chFirst)) // NCName
      {
          int iNameToken = ParseIdentifier ();
          if (IsOperatorName (iNameToken))
              m_fThisIsOperator = true;
          return iNameToken;
      }

      switch (chFirst)
      {
      // single-character tokens flagged as operators
      case ',': return OperatorToken (Token.COMMA);
      case '@': return OperatorToken (Token.AT);
      case '[': return OperatorToken (Token.BRACKET_OPEN);
      case '(': return OperatorToken (Token.PAREN_OPEN);
      case '+': return OperatorToken (Token.PLUS);
      case '-': return OperatorToken (Token.MINUS);
      case '$': return OperatorToken (Token.DOLLAR);
      case '|': return OperatorToken (Token.BAR);
      case '=': return OperatorToken (Token.EQ);

      // closing brackets are not flagged as operators
      case ']':
          GetChar ();
          return Token.BRACKET_CLOSE;
      case ')':
          GetChar ();
          return Token.PAREN_CLOSE;

      case '/':
          OperatorToken (Token.SLASH);
          return Accept ('/') ? Token.SLASH2 : Token.SLASH;

      case '>':
          OperatorToken (Token.GT);
          return Accept ('=') ? Token.GE : Token.GT;

      case '<':
          OperatorToken (Token.LT);
          return Accept ('=') ? Token.LE : Token.LT;

      case '.':
          GetChar ();
          if (Accept ('.'))
              return Token.DOT2;
          if (IsDigit (Peek ()))
          {
              // '.5' style number: hand the '.' back to the number parser
              PutBack ();
              return ParseNumber ();
          }
          return Token.DOT;

      case ':':
          GetChar ();
          if (!Accept (':'))
              return Token.ERROR;
          m_fThisIsOperator = true;
          return Token.COLON2;

      case '*':
          GetChar ();
          // at the start, or right after an operator-flagged token,
          // '*' is a name test; otherwise it is multiplication
          if (IsFirstToken || m_fPrevWasOperator)
              return Token.ASTERISK;
          m_fThisIsOperator = true;
          return Token.MULTIPLY;

      case '!':
          GetChar ();
          if (Accept ('='))
          {
              m_fThisIsOperator = true;
              return Token.NE;
          }
          break;

      default:
          break;
      }

      throw new XPathException ("invalid token: '"+chFirst+"'");
  }

  // Consumes one character, flags the current token as an operator and
  // returns iToken unchanged.
  private int OperatorToken (int iToken)
  {
      m_fThisIsOperator = true;
      GetChar ();
      return iToken;
  }

  // Consumes the next character if it equals chExpected.
  // Returns true when a character was consumed.
  private bool Accept (char chExpected)
  {
      if (Peek () != chExpected)
          return false;
      GetChar ();
      return true;
  }
  367. ///////////////////////////
  368. // yyParser.yyInput methods
  369. ///////////////////////////
  /** move on to next token.
  Clears the previous token's value, reads the next token, skips any
  whitespace after it and records it as the "previous" token for the
  next call.
  @return false once the end of the input has been reached.
  @throws XPathException if the input contains an invalid token.
  */
  public bool advance ()
  {
      m_objToken = null;
      m_fThisIsOperator = false;

      int iNext = ParseToken ();
      m_iToken = iNext;
      SkipWhitespace ();

      // remember what was just read; it decides how the next name or
      // '*' is interpreted
      m_fPrevWasOperator = m_fThisIsOperator;
      m_iTokenPrev = iNext;

      return iNext != Token.EOF;
  }
  /** classifies current token.
  Should not be called if advance() returned false.
  @return the token id stored by the most recent advance() call.
  */
  public int token ()
  {
      return this.m_iToken;
  }
  /** associated with current token.
  Should not be called if advance() returned false.
  @return value for token(): a boxed double for numbers, a string for
  literals and keyword names, an XmlQualifiedName for QNames and
  function names, or null for tokens that carry no value.
  */
  public Object value ()
  {
      return this.m_objToken;
  }
  // True until the first advance () call has completed, i.e. while no
  // previous token has been recorded.
  private bool IsFirstToken
  {
      get
      {
          return Token.EOF == m_iTokenPrev;
      }
  }
  // True when iToken is one of the node-type tokens:
  // comment, text, processing-instruction or node.
  private bool IsNodeType (int iToken)
  {
      return iToken == Token.COMMENT
          || iToken == Token.TEXT
          || iToken == Token.PROCESSING_INSTRUCTION
          || iToken == Token.NODE;
  }
  // True when iToken is one of the named operators:
  // and, or, mod or div.
  private bool IsOperatorName (int iToken)
  {
      return iToken == Token.AND
          || iToken == Token.OR
          || iToken == Token.MOD
          || iToken == Token.DIV;
  }
  // True when iToken is one of the thirteen axis-name tokens.
  private bool IsAxisName (int iToken)
  {
      return iToken == Token.ATTRIBUTE
          || iToken == Token.ANCESTOR
          || iToken == Token.ANCESTOR_OR_SELF
          || iToken == Token.CHILD
          || iToken == Token.DESCENDANT
          || iToken == Token.DESCENDANT_OR_SELF
          || iToken == Token.FOLLOWING
          || iToken == Token.FOLLOWING_SIBLING
          || iToken == Token.NAMESPACE
          || iToken == Token.PARENT
          || iToken == Token.PRECEDING
          || iToken == Token.PRECEDING_SIBLING
          || iToken == Token.SELF;
  }
  449. }
  450. }