Tokenizer.cs 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.IO;
  11. using System.Text;
  12. using System.Collections;
  13. using Mono.Xml.XPath;
  14. using Mono.Xml.XPath.yyParser;
  15. namespace System.Xml.XPath
  16. {
  17. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  18. {
  19. private char [] m_rgchInput;
  20. private int m_ich;
  21. private int m_cch;
  22. // private System.IO.StreamReader m_input;
  23. private int m_iToken;
  24. private Object m_objToken;
  25. private static Hashtable m_mapTokens = new Hashtable ();
  26. private static readonly Object [] rgTokenMap =
  27. {
  28. Token.AND, "and",
  29. Token.OR, "or",
  30. Token.DIV, "div",
  31. Token.MOD, "mod",
  32. Token.ANCESTOR, "ancestor",
  33. Token.ANCESTOR_OR_SELF, "ancestor-or-self",
  34. Token.ATTRIBUTE, "attribute",
  35. Token.CHILD, "child",
  36. Token.DESCENDANT, "descendant",
  37. Token.DESCENDANT_OR_SELF, "descendant-or-self",
  38. Token.FOLLOWING, "following",
  39. Token.FOLLOWING_SIBLING, "following-sibling",
  40. Token.NAMESPACE, "namespace",
  41. Token.PARENT, "parent",
  42. Token.PRECEDING, "preceding",
  43. Token.PRECEDING_SIBLING, "preceding-sibling",
  44. Token.SELF, "self",
  45. Token.COMMENT, "comment",
  46. Token.TEXT, "text",
  47. Token.PROCESSING_INSTRUCTION, "processing-instruction",
  48. Token.NODE, "node",
  49. };
  50. static Tokenizer ()
  51. {
  52. for (int i = 0; i < rgTokenMap.Length; i += 2)
  53. m_mapTokens.Add (rgTokenMap [i + 1], rgTokenMap [i]);
  54. }
  55. public Tokenizer (string strInput)
  56. {
  57. m_rgchInput = strInput.ToCharArray ();
  58. m_ich = 0;
  59. m_cch = strInput.Length;
  60. SkipWhitespace ();
  61. }
  62. private int Peek ()
  63. {
  64. if (m_ich >= m_cch)
  65. return -1;
  66. return m_rgchInput [m_ich];
  67. }
  68. private int GetChar ()
  69. {
  70. if (m_ich >= m_cch)
  71. return -1;
  72. return m_rgchInput [m_ich++];
  73. }
  74. private int PutBack ()
  75. {
  76. if (m_ich == 0)
  77. throw new XPathException ("invalid tokenizer state"); // TODO: better description
  78. return m_rgchInput [--m_ich];
  79. }
  80. private void SkipWhitespace ()
  81. {
  82. while (IsWhitespace (Peek ()))
  83. GetChar ();
  84. }
  85. [MonoTODO]
  86. private int ParseNumber ()
  87. {
  88. StringBuilder sb = new StringBuilder ();
  89. while (IsDigit (Peek ()))
  90. sb.Append ((char) GetChar ());
  91. // TODO: doesn't handle '3.' error case
  92. if (Peek () == '.')
  93. {
  94. sb.Append ((char) GetChar ());
  95. while (IsDigit (Peek ()))
  96. sb.Append ((char) GetChar ());
  97. }
  98. m_objToken = Double.Parse (sb.ToString ());
  99. return Token.NUMBER;
  100. }
  101. private int ParseLiteral ()
  102. {
  103. StringBuilder sb = new StringBuilder ();
  104. int chInit = GetChar ();
  105. int ch;
  106. while ((ch = Peek ()) != chInit)
  107. {
  108. if (ch == -1)
  109. return Token.ERROR;
  110. sb.Append ((char) GetChar ());
  111. }
  112. GetChar ();
  113. m_objToken = sb.ToString ();
  114. return Token.LITERAL;
  115. }
  116. private int ParseIdentifier ()
  117. {
  118. StringBuilder sb = new StringBuilder ();
  119. while (true)
  120. {
  121. int ch = Peek ();
  122. if (ch == '_' || ch == '-' ||
  123. (ch >= 'a' && ch <= 'z') ||
  124. (ch >= 'A' && ch <= 'Z'))
  125. {
  126. sb.Append ((char) GetChar ());
  127. }
  128. else
  129. break;
  130. }
  131. String strToken = sb.ToString ();
  132. Object objToken = m_mapTokens [strToken];
  133. if (objToken != null)
  134. {
  135. return (int) objToken;
  136. }
  137. else
  138. {
  139. m_objToken = strToken;
  140. SkipWhitespace ();
  141. if (Peek () == '(')
  142. return Token.FUNCTION_NAME;
  143. return Token.NCName;
  144. }
  145. }
  146. private static bool IsWhitespace (int ch)
  147. {
  148. return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
  149. }
  150. private static bool IsDigit (int ch)
  151. {
  152. return ch >= '0' && ch <= '9';
  153. }
  154. int ParseToken ()
  155. {
  156. int ch = Peek ();
  157. switch (ch)
  158. {
  159. case -1:
  160. return Token.EOF;
  161. case '/':
  162. GetChar ();
  163. if (Peek () == '/')
  164. {
  165. GetChar ();
  166. return Token.SLASH2;
  167. }
  168. return Token.SLASH;
  169. case '.':
  170. GetChar ();
  171. if (Peek () == '.')
  172. {
  173. GetChar ();
  174. return Token.DOT2;
  175. }
  176. else if (IsDigit (Peek ()))
  177. {
  178. PutBack ();
  179. return ParseNumber ();
  180. }
  181. return Token.DOT;
  182. case ':':
  183. GetChar ();
  184. if (Peek () == ':')
  185. {
  186. GetChar ();
  187. return Token.COLON2;
  188. }
  189. return Token.COLON;
  190. case ',':
  191. GetChar ();
  192. return Token.COMMA;
  193. case '@':
  194. GetChar ();
  195. return Token.AT;
  196. case '[':
  197. GetChar ();
  198. return Token.BRACKET_OPEN;
  199. case ']':
  200. GetChar ();
  201. return Token.BRACKET_CLOSE;
  202. case '(':
  203. GetChar ();
  204. return Token.PAREN_OPEN;
  205. case ')':
  206. GetChar ();
  207. return Token.PAREN_CLOSE;
  208. case '+':
  209. GetChar ();
  210. return Token.PLUS;
  211. case '-':
  212. GetChar ();
  213. return Token.MINUS;
  214. case '*':
  215. GetChar ();
  216. return Token.ASTERISK;
  217. case '$':
  218. GetChar ();
  219. return Token.DOLLAR;
  220. case '|':
  221. GetChar ();
  222. return Token.BAR;
  223. case '=':
  224. GetChar ();
  225. return Token.EQ;
  226. case '!':
  227. GetChar ();
  228. if (Peek () == '=')
  229. {
  230. GetChar ();
  231. return Token.NE;
  232. }
  233. break;
  234. case '>':
  235. GetChar ();
  236. if (Peek () == '=')
  237. {
  238. GetChar ();
  239. return Token.GE;
  240. }
  241. return Token.GT;
  242. case '<':
  243. GetChar ();
  244. if (Peek () == '=')
  245. {
  246. GetChar ();
  247. return Token.LE;
  248. }
  249. return Token.LT;
  250. case '\'':
  251. return ParseLiteral ();
  252. case '\"':
  253. return ParseLiteral ();
  254. default:
  255. {
  256. if (IsDigit (ch))
  257. {
  258. return ParseNumber ();
  259. }
  260. else
  261. {
  262. return ParseIdentifier ();
  263. }
  264. }
  265. }
  266. return Token.ERROR;
  267. }
  268. ///////////////////////////
  269. // yyParser.yyInput methods
  270. ///////////////////////////
  271. /** move on to next token.
  272. @return false if positioned beyond tokens.
  273. @throws IOException on input error.
  274. */
  275. public bool advance ()
  276. {
  277. m_objToken = null;
  278. m_iToken = ParseToken ();
  279. SkipWhitespace ();
  280. return (m_iToken != Token.EOF);
  281. }
  282. /** classifies current token.
  283. Should not be called if advance() returned false.
  284. @return current %token or single character.
  285. */
  286. public int token ()
  287. {
  288. return m_iToken;
  289. }
  290. /** associated with current token.
  291. Should not be called if advance() returned false.
  292. @return value for token().
  293. */
  294. public Object value ()
  295. {
  296. return m_objToken;
  297. }
  298. }
  299. }