Tokenizer.cs 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.IO;
  11. using System.Text;
  12. using System.Collections;
  13. using Mono.Xml.XPath;
  14. using Mono.Xml.XPath.yyParser;
  15. namespace System.Xml.XPath
  16. {
  17. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  18. {
  19. private char [] m_rgchInput;
  20. private int m_ich;
  21. private int m_cch;
  22. private int m_iToken;
  23. private int m_iTokenPrev = Token.EOF;
  24. private Object m_objToken;
  25. private bool m_fPrevWasOperator = false;
  26. private bool m_fThisIsOperator = false;
  27. private static readonly Hashtable s_mapTokens = new Hashtable ();
  28. private static readonly Object [] s_rgTokenMap =
  29. {
  30. Token.AND, "and",
  31. Token.OR, "or",
  32. Token.DIV, "div",
  33. Token.MOD, "mod",
  34. Token.ANCESTOR, "ancestor",
  35. Token.ANCESTOR_OR_SELF, "ancestor-or-self",
  36. Token.ATTRIBUTE, "attribute",
  37. Token.CHILD, "child",
  38. Token.DESCENDANT, "descendant",
  39. Token.DESCENDANT_OR_SELF, "descendant-or-self",
  40. Token.FOLLOWING, "following",
  41. Token.FOLLOWING_SIBLING, "following-sibling",
  42. Token.NAMESPACE, "namespace",
  43. Token.PARENT, "parent",
  44. Token.PRECEDING, "preceding",
  45. Token.PRECEDING_SIBLING, "preceding-sibling",
  46. Token.SELF, "self",
  47. Token.COMMENT, "comment",
  48. Token.TEXT, "text",
  49. Token.PROCESSING_INSTRUCTION, "processing-instruction",
  50. Token.NODE, "node",
  51. };
  52. private const char EOL = '\0';
  53. static Tokenizer ()
  54. {
  55. for (int i = 0; i < s_rgTokenMap.Length; i += 2)
  56. s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);
  57. }
  58. public Tokenizer (string strInput)
  59. {
  60. m_rgchInput = strInput.ToCharArray ();
  61. m_ich = 0;
  62. m_cch = strInput.Length;
  63. SkipWhitespace ();
  64. }
  65. private char Peek (int iOffset)
  66. {
  67. if (m_ich + iOffset>= m_cch)
  68. return EOL;
  69. return m_rgchInput [m_ich + iOffset];
  70. }
  71. private char Peek ()
  72. {
  73. return Peek (0);
  74. }
  75. private char GetChar ()
  76. {
  77. if (m_ich >= m_cch)
  78. return EOL;
  79. return m_rgchInput [m_ich++];
  80. }
  81. private char PutBack ()
  82. {
  83. if (m_ich == 0)
  84. throw new XPathException ("invalid tokenizer state"); // TODO: better description
  85. return m_rgchInput [--m_ich];
  86. }
  87. private bool SkipWhitespace () // returns trus if any whitespace was skipped
  88. {
  89. if (!IsWhitespace (Peek ()))
  90. return false;
  91. while (IsWhitespace (Peek ()))
  92. GetChar ();
  93. return true;
  94. }
  95. [MonoTODO]
  96. private int ParseNumber ()
  97. {
  98. StringBuilder sb = new StringBuilder ();
  99. while (IsDigit (Peek ()))
  100. sb.Append ((char) GetChar ());
  101. // TODO: doesn't handle '3.' error case
  102. if (Peek () == '.')
  103. {
  104. sb.Append ((char) GetChar ());
  105. while (IsDigit (Peek ()))
  106. sb.Append ((char) GetChar ());
  107. }
  108. m_objToken = Double.Parse (sb.ToString ());
  109. return Token.NUMBER;
  110. }
  111. private int ParseLiteral ()
  112. {
  113. StringBuilder sb = new StringBuilder ();
  114. char chInit = GetChar ();
  115. char ch;
  116. while ((ch = Peek ()) != chInit)
  117. {
  118. if (ch == EOL)
  119. throw new XPathException ("unmatched "+chInit+" in expression");
  120. sb.Append ((char) GetChar ());
  121. }
  122. GetChar ();
  123. m_objToken = sb.ToString ();
  124. return Token.LITERAL;
  125. }
  126. private int ParseIdentifier ()
  127. {
  128. StringBuilder sb = new StringBuilder ();
  129. char ch;
  130. while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))
  131. sb.Append ((char) GetChar ());
  132. String strToken = sb.ToString ();
  133. Object objToken = s_mapTokens [strToken];
  134. int iToken = (objToken != null) ? (int) objToken : Token.NCName;
  135. m_objToken = strToken;
  136. if (!IsFirstToken)
  137. {
  138. // the second half of a QName is always an NCName
  139. if (m_iTokenPrev == Token.COLON)
  140. return Token.NCName;
  141. // If there is a preceding token and the preceding
  142. // token is not one of @, ::, (, [, , or an Operator,
  143. // then a * must be recognized as a MultiplyOperator
  144. // and an NCName must be recognized as an OperatorName.
  145. if (!m_fPrevWasOperator)
  146. {
  147. if (objToken == null || !IsOperatorName (iToken))
  148. throw new XPathException ("invalid operator name: '"+strToken+"'");
  149. return iToken;
  150. }
  151. }
  152. SkipWhitespace ();
  153. ch = Peek ();
  154. if (ch == '(')
  155. {
  156. // If the character following an NCName (possibly
  157. // after intervening ExprWhitespace) is (, then the
  158. // token must be recognized as a NodeType or a FunctionName.
  159. if (objToken == null)
  160. return Token.FUNCTION_NAME;
  161. if (IsNodeType (iToken))
  162. return iToken;
  163. throw new XPathException ("invalid function name: '"+strToken+"'");
  164. }
  165. else if (ch == ':' && Peek (1) == ':')
  166. {
  167. // If the two characters following an NCName (possibly
  168. // after intervening ExprWhitespace) are ::, then the
  169. // token must be recognized as an AxisName.
  170. if (objToken == null || !IsAxisName (iToken))
  171. throw new XPathException ("invalid axis name: '"+strToken+"'");
  172. return iToken;
  173. }
  174. // Otherwise, the token must not be recognized as a
  175. // MultiplyOperator, an OperatorName, a NodeType,
  176. // a FunctionName, or an AxisName.
  177. return Token.NCName;
  178. }
  179. private static bool IsWhitespace (char ch)
  180. {
  181. // return Char.IsWhiteSpace (ch);
  182. return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
  183. }
  184. private static bool IsDigit (char ch)
  185. {
  186. // return Char.IsDigit (ch);
  187. return ch >= '0' && ch <= '9';
  188. }
  189. int ParseToken ()
  190. {
  191. char ch = Peek ();
  192. switch (ch)
  193. {
  194. case EOL:
  195. return Token.EOF;
  196. case '/':
  197. m_fThisIsOperator = true;
  198. GetChar ();
  199. if (Peek () == '/')
  200. {
  201. GetChar ();
  202. return Token.SLASH2;
  203. }
  204. return Token.SLASH;
  205. case '.':
  206. GetChar ();
  207. if (Peek () == '.')
  208. {
  209. GetChar ();
  210. return Token.DOT2;
  211. }
  212. else if (IsDigit (Peek ()))
  213. {
  214. PutBack ();
  215. return ParseNumber ();
  216. }
  217. return Token.DOT;
  218. case ':':
  219. GetChar ();
  220. if (Peek () == ':')
  221. {
  222. m_fThisIsOperator = true;
  223. GetChar ();
  224. return Token.COLON2;
  225. }
  226. return Token.COLON;
  227. case ',':
  228. m_fThisIsOperator = true;
  229. GetChar ();
  230. return Token.COMMA;
  231. case '@':
  232. m_fThisIsOperator = true;
  233. GetChar ();
  234. return Token.AT;
  235. case '[':
  236. m_fThisIsOperator = true;
  237. GetChar ();
  238. return Token.BRACKET_OPEN;
  239. case ']':
  240. GetChar ();
  241. return Token.BRACKET_CLOSE;
  242. case '(':
  243. m_fThisIsOperator = true;
  244. GetChar ();
  245. return Token.PAREN_OPEN;
  246. case ')':
  247. GetChar ();
  248. return Token.PAREN_CLOSE;
  249. case '+':
  250. m_fThisIsOperator = true;
  251. GetChar ();
  252. return Token.PLUS;
  253. case '-':
  254. m_fThisIsOperator = true;
  255. GetChar ();
  256. return Token.MINUS;
  257. case '*':
  258. GetChar ();
  259. if (!IsFirstToken && !m_fPrevWasOperator)
  260. {
  261. m_fThisIsOperator = true;
  262. return Token.MULTIPLY;
  263. }
  264. return Token.ASTERISK;
  265. case '$':
  266. GetChar ();
  267. return Token.DOLLAR;
  268. case '|':
  269. m_fThisIsOperator = true;
  270. GetChar ();
  271. return Token.BAR;
  272. case '=':
  273. m_fThisIsOperator = true;
  274. GetChar ();
  275. return Token.EQ;
  276. case '!':
  277. GetChar ();
  278. if (Peek () == '=')
  279. {
  280. m_fThisIsOperator = true;
  281. GetChar ();
  282. return Token.NE;
  283. }
  284. break;
  285. case '>':
  286. m_fThisIsOperator = true;
  287. GetChar ();
  288. if (Peek () == '=')
  289. {
  290. GetChar ();
  291. return Token.GE;
  292. }
  293. return Token.GT;
  294. case '<':
  295. m_fThisIsOperator = true;
  296. GetChar ();
  297. if (Peek () == '=')
  298. {
  299. GetChar ();
  300. return Token.LE;
  301. }
  302. return Token.LT;
  303. case '\'':
  304. return ParseLiteral ();
  305. case '\"':
  306. return ParseLiteral ();
  307. default:
  308. if (IsDigit (ch))
  309. {
  310. return ParseNumber ();
  311. }
  312. else if (Char.IsLetter (ch) || ch == '_') // NCName
  313. {
  314. int iToken = ParseIdentifier ();
  315. if (IsOperatorName (iToken))
  316. m_fThisIsOperator = true;
  317. return iToken;
  318. }
  319. break;
  320. }
  321. throw new XPathException ("invalid token: '"+ch+"'");
  322. }
  323. ///////////////////////////
  324. // yyParser.yyInput methods
  325. ///////////////////////////
  326. /** move on to next token.
  327. @return false if positioned beyond tokens.
  328. @throws IOException on input error.
  329. */
  330. public bool advance ()
  331. {
  332. m_fThisIsOperator = false;
  333. m_objToken = null;
  334. m_iToken = ParseToken ();
  335. bool fWhitespace = SkipWhitespace ();
  336. m_iTokenPrev = m_iToken;
  337. m_fPrevWasOperator = m_fThisIsOperator;
  338. return (m_iToken != Token.EOF);
  339. }
  340. /** classifies current token.
  341. Should not be called if advance() returned false.
  342. @return current %token or single character.
  343. */
  344. public int token ()
  345. {
  346. return m_iToken;
  347. }
  348. /** associated with current token.
  349. Should not be called if advance() returned false.
  350. @return value for token().
  351. */
  352. public Object value ()
  353. {
  354. return m_objToken;
  355. }
  356. private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }
  357. private bool IsNodeType (int iToken)
  358. {
  359. switch (iToken)
  360. {
  361. case Token.COMMENT:
  362. case Token.TEXT:
  363. case Token.PROCESSING_INSTRUCTION:
  364. case Token.NODE:
  365. return true;
  366. default:
  367. return false;
  368. }
  369. }
  370. private bool IsOperatorName (int iToken)
  371. {
  372. switch (iToken)
  373. {
  374. case Token.AND:
  375. case Token.OR:
  376. case Token.MOD:
  377. case Token.DIV:
  378. return true;
  379. default:
  380. return false;
  381. }
  382. }
  383. private bool IsAxisName (int iToken)
  384. {
  385. switch (iToken)
  386. {
  387. case Token.ATTRIBUTE:
  388. case Token.ANCESTOR:
  389. case Token.ANCESTOR_OR_SELF:
  390. case Token.CHILD:
  391. case Token.DESCENDANT:
  392. case Token.DESCENDANT_OR_SELF:
  393. case Token.FOLLOWING:
  394. case Token.FOLLOWING_SIBLING:
  395. case Token.NAMESPACE:
  396. case Token.PARENT:
  397. case Token.PRECEDING:
  398. case Token.PRECEDING_SIBLING:
  399. case Token.SELF:
  400. return true;
  401. default:
  402. return false;
  403. }
  404. }
  405. }
  406. }