Tokenizer.cs 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.Globalization;
  11. using System.IO;
  12. using System.Text;
  13. using System.Collections;
  14. using Mono.Xml.XPath;
  15. using Mono.Xml.XPath.yyParser;
  16. namespace System.Xml.XPath
  17. {
  18. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  19. {
  // --- input buffer -----------------------------------------------------
  // Characters of the expression being tokenized.
  private char [] m_rgchInput;
  // Index into m_rgchInput of the next unread character.
  private int m_ich;
  // Number of characters in the input.
  private int m_cch;

  // --- token state ------------------------------------------------------
  // Token produced by the most recent advance ().
  private int m_iToken;
  // Token produced by the advance () before that; Token.EOF means that no
  // token has been produced yet (see IsFirstToken).
  private int m_iTokenPrev = Token.EOF;
  // Value attached to the current token (returned by value ()).
  private object m_objToken;
  // Operator flags for the previous and the current token; they drive the
  // '*' and NCName disambiguation in ParseToken / ParseIdentifier.
  private bool m_fPrevWasOperator;
  private bool m_fThisIsOperator;

  // --- keyword table ----------------------------------------------------
  // Keyword name -> token id; populated by the static constructor.
  private static readonly Hashtable s_mapTokens = new Hashtable ();

  // Flat list of (token id, keyword) pairs that seeds s_mapTokens.
  private static readonly object [] s_rgTokenMap = new object []
  {
      // operator names
      Token.AND,                    "and",
      Token.OR,                     "or",
      Token.DIV,                    "div",
      Token.MOD,                    "mod",

      // axis names
      Token.ANCESTOR,               "ancestor",
      Token.ANCESTOR_OR_SELF,       "ancestor-or-self",
      Token.ATTRIBUTE,              "attribute",
      Token.CHILD,                  "child",
      Token.DESCENDANT,             "descendant",
      Token.DESCENDANT_OR_SELF,     "descendant-or-self",
      Token.FOLLOWING,              "following",
      Token.FOLLOWING_SIBLING,      "following-sibling",
      Token.NAMESPACE,              "namespace",
      Token.PARENT,                 "parent",
      Token.PRECEDING,              "preceding",
      Token.PRECEDING_SIBLING,      "preceding-sibling",
      Token.SELF,                   "self",

      // node types
      Token.COMMENT,                "comment",
      Token.TEXT,                   "text",
      Token.PROCESSING_INSTRUCTION, "processing-instruction",
      Token.NODE,                   "node",
  };

  // Sentinel returned by Peek () / GetChar () once the input is exhausted.
  private const char EOL = '\0';
  /// <summary>
  /// Fills the keyword lookup table from the flat (token id, keyword)
  /// pair list, keyed by keyword.
  /// </summary>
  static Tokenizer ()
  {
      int iEntry = 0;
      while (iEntry < s_rgTokenMap.Length)
      {
          object objTokenId = s_rgTokenMap [iEntry];
          object objKeyword = s_rgTokenMap [iEntry + 1];
          s_mapTokens.Add (objKeyword, objTokenId);
          iEntry += 2;
      }
  }
  /// <summary>
  /// Creates a tokenizer over the given expression text and skips any
  /// leading whitespace so the first advance () starts on a token.
  /// </summary>
  /// <param name="strInput">The expression to tokenize; must not be null.</param>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="strInput"/> is null.
  /// </exception>
  public Tokenizer (string strInput)
  {
      // Fail with a descriptive exception instead of a NullReferenceException
      // from the ToCharArray () call below.
      if (strInput == null)
          throw new ArgumentNullException ("strInput");

      m_rgchInput = strInput.ToCharArray ();
      m_ich = 0;
      m_cch = m_rgchInput.Length;
      SkipWhitespace ();
  }
  /// <summary>
  /// Returns the character <paramref name="iOffset"/> positions ahead of
  /// the read position without consuming anything.
  /// </summary>
  /// <returns>The character, or EOL when that position is at or past the end of the input.</returns>
  private char Peek (int iOffset)
  {
      int ichTarget = m_ich + iOffset;
      return (ichTarget < m_cch) ? m_rgchInput [ichTarget] : EOL;
  }
  /// <summary>
  /// Returns the next unread character without consuming it, or EOL when
  /// the input is exhausted.
  /// </summary>
  private char Peek ()
  {
      if (m_ich < m_cch)
          return m_rgchInput [m_ich];
      return EOL;
  }
  /// <summary>
  /// Consumes and returns the next character. At end of input nothing is
  /// consumed and EOL is returned.
  /// </summary>
  private char GetChar ()
  {
      if (m_ich < m_cch)
      {
          char chRead = m_rgchInput [m_ich];
          m_ich++;
          return chRead;
      }
      return EOL;
  }
  /// <summary>
  /// Steps the read position back by one character and returns the
  /// character that is now next to be read.
  /// </summary>
  /// <exception cref="XPathException">The read position is already at the start of the input.</exception>
  private char PutBack ()
  {
      if (m_ich != 0)
      {
          m_ich--;
          return m_rgchInput [m_ich];
      }
      throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
  }
  /// <summary>
  /// Consumes a run of whitespace characters at the read position.
  /// </summary>
  /// <returns>true if at least one whitespace character was skipped.</returns>
  private bool SkipWhitespace ()
  {
      bool fSkippedAny = false;
      while (IsWhitespace (Peek ()))
      {
          GetChar ();
          fSkippedAny = true;
      }
      return fSkippedAny;
  }
  /// <summary>
  /// Scans a number (digits, optionally followed by '.' and more digits),
  /// stores its value as a double in m_objToken and returns Token.NUMBER.
  /// </summary>
  private int ParseNumber ()
  {
      StringBuilder sbNumber = new StringBuilder ();

      // integer part (empty when the number starts with '.')
      for (char ch = Peek (); IsDigit (ch); ch = Peek ())
          sbNumber.Append (GetChar ());

      // A trailing '.' with no fraction digits ("3.") is deliberately
      // accepted rather than treated as an error.
      if (Peek () == '.')
      {
          sbNumber.Append (GetChar ());
          for (char ch = Peek (); IsDigit (ch); ch = Peek ())
              sbNumber.Append (GetChar ());
      }

      string strNumber = sbNumber.ToString ();
      m_objToken = Double.Parse (strNumber, NumberFormatInfo.InvariantInfo);
      return Token.NUMBER;
  }
  /// <summary>
  /// Scans a quoted literal. The character at the read position is taken
  /// as the delimiter; everything up to the next occurrence of it becomes
  /// the token value (delimiters excluded).
  /// </summary>
  /// <returns>Token.LITERAL.</returns>
  /// <exception cref="XPathException">The input ends before the closing delimiter.</exception>
  private int ParseLiteral ()
  {
      char chQuote = GetChar ();
      StringBuilder sbText = new StringBuilder ();

      for (;;)
      {
          char chNext = Peek ();
          if (chNext == chQuote)
              break;
          if (chNext == EOL)
              throw new XPathException ("unmatched "+chQuote+" in expression");
          sbText.Append (GetChar ());
      }

      GetChar ();     // closing delimiter
      m_objToken = sbText.ToString ();
      return Token.LITERAL;
  }
  /// <summary>
  /// Reads a name: a letter or '_' followed by any number of letters,
  /// digits, '_', '-' or '.'. Whitespace following the name is skipped.
  /// </summary>
  /// <returns>
  /// The name, or null (with nothing consumed) when the character at the
  /// read position cannot start a name.
  /// </returns>
  private string ReadIdentifier ()
  {
      char chFirst = Peek ();
      bool fCanStartName = (chFirst == '_') || Char.IsLetter (chFirst);
      if (!fCanStartName)
          return null;

      StringBuilder sbName = new StringBuilder ();
      sbName.Append (GetChar ());

      for (;;)
      {
          char chNext = Peek ();
          bool fNameChar = chNext == '_'
              || chNext == '-'
              || chNext == '.'
              || Char.IsLetterOrDigit (chNext);
          if (!fNameChar)
              break;
          sbName.Append (GetChar ());
      }

      SkipWhitespace ();
      return sbName.ToString ();
  }
  /// <summary>
  /// Scans a name and classifies it from its context: axis name (followed
  /// by "::"), prefixed name (followed by ':'), operator name, node type,
  /// function name, or plain QName. m_objToken is set to the raw name
  /// string for keyword tokens and to an XmlQualifiedName otherwise.
  /// </summary>
  /// <returns>The token id for the name.</returns>
  /// <exception cref="XPathException">
  /// The name is not valid in the position where it appears.
  /// </exception>
  private int ParseIdentifier ()
  {
      string strName = ReadIdentifier ();
      object objKeyword = s_mapTokens [strName];
      bool fKeyword = (objKeyword != null);
      int iKeyword = fKeyword ? (int) objKeyword : Token.QName;
      m_objToken = strName;

      char chNext = Peek ();
      if (chNext == ':')
      {
          // A single ':' introduces the local part of a prefixed name.
          if (Peek (1) != ':')
              return ParsePrefixedName (strName);

          // If the two characters following an NCName (possibly
          // after intervening ExprWhitespace) are ::, then the
          // token must be recognized as an AxisName.
          if (fKeyword && IsAxisName (iKeyword))
              return iKeyword;
          throw new XPathException ("invalid axis name: '"+strName+"'");
      }

      // If there is a preceding token and the preceding
      // token is not one of @, ::, (, [, , or an Operator,
      // then an NCName must be recognized as an OperatorName.
      bool fOperatorExpected = !(IsFirstToken || m_fPrevWasOperator);
      if (fOperatorExpected)
      {
          if (fKeyword && IsOperatorName (iKeyword))
              return iKeyword;
          throw new XPathException ("invalid operator name: '"+strName+"'");
      }

      // If the character following an NCName (possibly
      // after intervening ExprWhitespace) is (, then the
      // token must be recognized as a NodeType or a FunctionName.
      // Keywords other than node types are rejected here.
      if (chNext == '(' && fKeyword)
      {
          if (IsNodeType (iKeyword))
              return iKeyword;
          throw new XPathException ("invalid function name: '"+strName+"'");
      }

      m_objToken = new XmlQualifiedName (strName, "");
      return (chNext == '(') ? Token.FUNCTION_NAME : Token.QName;
  }

  // Finishes a name of the form prefix:local or prefix:* once the prefix
  // has been read and the read position is on the single ':'.
  private int ParsePrefixedName (string strPrefix)
  {
      GetChar ();     // the ':'
      SkipWhitespace ();

      char chAfterColon = Peek ();
      if (chAfterColon == '*')
      {
          GetChar ();
          m_objToken = new XmlQualifiedName ("", strPrefix);
          return Token.QName;
      }

      string strLocal = ReadIdentifier ();
      if (strLocal == null)
          throw new XPathException ("invalid QName: "+strPrefix+":"+chAfterColon);

      m_objToken = new XmlQualifiedName (strLocal, strPrefix);
      return (Peek () == '(') ? Token.FUNCTION_NAME : Token.QName;
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is one of the four whitespace
  /// characters this tokenizer skips: space, tab, line feed, carriage
  /// return. Other characters that Char.IsWhiteSpace accepts are not
  /// treated as whitespace here.
  /// </summary>
  private static bool IsWhitespace (char ch)
  {
      switch (ch)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
          return true;
      default:
          return false;
      }
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is an ASCII decimal digit
  /// ('0' through '9'). Digits from other scripts are not accepted.
  /// </summary>
  private static bool IsDigit (char ch)
  {
      bool fOutsideRange = (ch < '0') || (ch > '9');
      return !fOutsideRange;
  }
  /// <summary>
  /// Scans one token starting at the read position. Sets
  /// m_fThisIsOperator for tokens after which a following '*' or name is
  /// not to be read as an operator, and delegates to ParseNumber,
  /// ParseLiteral and ParseIdentifier for multi-character tokens.
  /// </summary>
  /// <returns>
  /// The token id; Token.EOF at end of input, Token.ERROR for a lone ':'.
  /// </returns>
  /// <exception cref="XPathException">
  /// The character at the read position cannot start a token (this
  /// includes a '!' that is not followed by '=').
  /// </exception>
  int ParseToken ()
  {
      char ch = Peek ();
      switch (ch)
      {
      case EOL:
          return Token.EOF;

      // tokens that may be extended by one more character
      case '/':
          return ConsumeOperatorPair ('/', Token.SLASH, Token.SLASH2);
      case '>':
          return ConsumeOperatorPair ('=', Token.GT, Token.GE);
      case '<':
          return ConsumeOperatorPair ('=', Token.LT, Token.LE);

      case '.':
          GetChar ();
          if (Peek () == '.')
              return ConsumeAs (Token.DOT2, false);
          if (!IsDigit (Peek ()))
              return Token.DOT;
          // ".5" style number: give the '.' back and let ParseNumber read it
          PutBack ();
          return ParseNumber ();

      case ':':
          GetChar ();
          return (Peek () == ':') ? ConsumeAs (Token.COLON2, true) : Token.ERROR;

      case '!':
          GetChar ();
          if (Peek () == '=')
              return ConsumeAs (Token.NE, true);
          break;      // a lone '!' is reported as an invalid token below

      case '*':
          // '*' is the multiply operator only when it follows a token that
          // is not itself flagged as an operator; otherwise it is a name test.
          if (!IsFirstToken && !m_fPrevWasOperator)
              return ConsumeAs (Token.MULTIPLY, true);
          return ConsumeAs (Token.ASTERISK, false);

      // single-character tokens flagged as operators
      case ',':
          return ConsumeAs (Token.COMMA, true);
      case '@':
          return ConsumeAs (Token.AT, true);
      case '[':
          return ConsumeAs (Token.BRACKET_OPEN, true);
      case '(':
          return ConsumeAs (Token.PAREN_OPEN, true);
      case '+':
          return ConsumeAs (Token.PLUS, true);
      case '-':
          return ConsumeAs (Token.MINUS, true);
      case '$':
          return ConsumeAs (Token.DOLLAR, true);
      case '|':
          return ConsumeAs (Token.BAR, true);
      case '=':
          return ConsumeAs (Token.EQ, true);

      // single-character tokens that are not flagged as operators
      case ']':
          return ConsumeAs (Token.BRACKET_CLOSE, false);
      case ')':
          return ConsumeAs (Token.PAREN_CLOSE, false);

      case '\'':
      case '\"':
          return ParseLiteral ();

      default:
          if (IsDigit (ch))
              return ParseNumber ();
          if (ch == '_' || Char.IsLetter (ch))    // NCName
          {
              int iNameToken = ParseIdentifier ();
              if (IsOperatorName (iNameToken))
                  m_fThisIsOperator = true;
              return iNameToken;
          }
          break;
      }

      throw new XPathException ("invalid token: '"+ch+"'");
  }

  // Consumes one character and returns iToken, flagging the current token
  // as an operator when fOperator is set (the flag is never cleared here).
  private int ConsumeAs (int iToken, bool fOperator)
  {
      GetChar ();
      if (fOperator)
          m_fThisIsOperator = true;
      return iToken;
  }

  // Consumes an operator character and, when the next character is
  // chSecond, that one as well; returns iLong for the two-character form
  // and iShort otherwise. The token is always flagged as an operator.
  private int ConsumeOperatorPair (char chSecond, int iShort, int iLong)
  {
      m_fThisIsOperator = true;
      GetChar ();
      if (Peek () != chSecond)
          return iShort;
      GetChar ();
      return iLong;
  }
  347. ///////////////////////////
  348. // yyParser.yyInput methods
  349. ///////////////////////////
  /// <summary>
  /// Moves on to the next token: scans it, skips the whitespace that
  /// follows it, and records it (and whether it was an operator) as the
  /// "previous token" context used when scanning the token after it.
  /// </summary>
  /// <returns>false if positioned beyond the tokens (the scan produced Token.EOF); otherwise true.</returns>
  /// <exception cref="XPathException">
  /// Raised by the scanning routines when the input cannot be tokenized.
  /// </exception>
  public bool advance ()
  {
      m_fThisIsOperator = false;
      m_objToken = null;
      m_iToken = ParseToken ();

      // Only the skipping matters here; whether anything was skipped is
      // not used.
      SkipWhitespace ();

      m_iTokenPrev = m_iToken;
      m_fPrevWasOperator = m_fThisIsOperator;
      return m_iToken != Token.EOF;
  }
  /// <summary>
  /// Classifies the current token. Should not be called if advance ()
  /// returned false.
  /// </summary>
  /// <returns>The id of the token produced by the latest advance ().</returns>
  public int token ()
  {
      int iCurrent = m_iToken;
      return iCurrent;
  }
  /// <summary>
  /// Gives the value associated with the current token. Should not be
  /// called if advance () returned false.
  /// </summary>
  /// <returns>
  /// The value stored for the token reported by token (); null when the
  /// scan stored none.
  /// </returns>
  public Object value ()
  {
      Object objCurrent = m_objToken;
      return objCurrent;
  }
  // True until the first advance () has completed: the previous-token slot
  // still holds its initial Token.EOF value. It is also true after an
  // advance () that itself produced Token.EOF.
  private bool IsFirstToken
  {
      get
      {
          return Token.EOF == m_iTokenPrev;
      }
  }
  /// <summary>
  /// Tells whether <paramref name="iToken"/> is one of the node type
  /// tokens: COMMENT, TEXT, PROCESSING_INSTRUCTION or NODE.
  /// </summary>
  private bool IsNodeType (int iToken)
  {
      return iToken == Token.COMMENT
          || iToken == Token.TEXT
          || iToken == Token.PROCESSING_INSTRUCTION
          || iToken == Token.NODE;
  }
  /// <summary>
  /// Tells whether <paramref name="iToken"/> is one of the named operator
  /// tokens: AND, OR, MOD or DIV.
  /// </summary>
  private bool IsOperatorName (int iToken)
  {
      return iToken == Token.AND
          || iToken == Token.OR
          || iToken == Token.MOD
          || iToken == Token.DIV;
  }
  /// <summary>
  /// Tells whether <paramref name="iToken"/> is one of the thirteen axis
  /// name tokens listed below.
  /// </summary>
  private bool IsAxisName (int iToken)
  {
      return iToken == Token.ATTRIBUTE
          || iToken == Token.ANCESTOR
          || iToken == Token.ANCESTOR_OR_SELF
          || iToken == Token.CHILD
          || iToken == Token.DESCENDANT
          || iToken == Token.DESCENDANT_OR_SELF
          || iToken == Token.FOLLOWING
          || iToken == Token.FOLLOWING_SIBLING
          || iToken == Token.NAMESPACE
          || iToken == Token.PARENT
          || iToken == Token.PRECEDING
          || iToken == Token.PRECEDING_SIBLING
          || iToken == Token.SELF;
  }
  429. }
  430. }