Tokenizer.cs 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.IO;
  11. using System.Text;
  12. using System.Collections;
  13. using Mono.Xml.XPath;
  14. using Mono.Xml.XPath.yyParser;
  15. namespace System.Xml.XPath
  16. {
  /// <summary>
  /// Lexer for XPath expressions. Splits an input string into tokens and
  /// exposes them to the parser through the yyInput interface
  /// (advance / token / value).
  /// </summary>
  internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  {
      // Input buffer, current read position and total number of characters.
      private readonly char [] m_rgchInput;
      private int m_ich;
      private readonly int m_cch;

      // Kind and value of the token produced by the most recent advance () call.
      private int m_iToken;
      private Object m_objToken;

      // Keyword text -> boxed Token constant; filled once by the static constructor.
      private static readonly Hashtable m_mapTokens = new Hashtable ();

      // Flat list of (token constant, keyword text) pairs used to build m_mapTokens.
      private static readonly Object [] rgTokenMap =
      {
          Token.AND, "and",
          Token.OR, "or",
          Token.DIV, "div",
          Token.MOD, "mod",
          Token.ANCESTOR, "ancestor",
          Token.ANCESTOR_OR_SELF, "ancestor-or-self",
          Token.ATTRIBUTE, "attribute",
          Token.CHILD, "child",
          Token.DESCENDANT, "descendant",
          Token.DESCENDANT_OR_SELF, "descendant-or-self",
          Token.FOLLOWING, "following",
          Token.FOLLOWING_SIBLING, "following-sibling",
          Token.NAMESPACE, "namespace",
          Token.PARENT, "parent",
          Token.PRECEDING, "preceding",
          Token.PRECEDING_SIBLING, "preceding-sibling",
          Token.SELF, "self",
          Token.COMMENT, "comment",
          Token.TEXT, "text",
          Token.PROCESSING_INSTRUCTION, "processing-instruction",
          Token.NODE, "node",
      };

      static Tokenizer ()
      {
          // rgTokenMap holds pairs: even index = token constant, odd index = keyword.
          for (int i = 0; i < rgTokenMap.Length; i += 2)
          {
              m_mapTokens.Add (rgTokenMap [i + 1], rgTokenMap [i]);
          }
      }

      /// <summary>
      /// Creates a tokenizer over <paramref name="strInput"/> and skips any
      /// leading whitespace so the first advance () starts on a token.
      /// </summary>
      /// <param name="strInput">The XPath expression text.</param>
      /// <exception cref="ArgumentNullException">strInput is null.</exception>
      public Tokenizer (string strInput)
      {
          if (strInput == null)
          {
              throw new ArgumentNullException ("strInput");
          }

          m_rgchInput = strInput.ToCharArray ();
          m_ich = 0;
          m_cch = m_rgchInput.Length;
          SkipWhitespace ();
      }

      /// <summary>Returns the next character without consuming it, or -1 at end of input.</summary>
      private int Peek ()
      {
          if (m_ich >= m_cch)
          {
              return -1;
          }
          return m_rgchInput [m_ich];
      }

      /// <summary>Consumes and returns the next character, or returns -1 at end of input.</summary>
      private int GetChar ()
      {
          if (m_ich >= m_cch)
          {
              return -1;
          }
          return m_rgchInput [m_ich++];
      }

      /// <summary>Consumes characters for as long as IsWhitespace accepts them.</summary>
      private void SkipWhitespace ()
      {
          while (IsWhitespace (Peek ()))
          {
              GetChar ();
          }
      }

      /// <summary>Appends the run of ASCII digits at the read position to <paramref name="sb"/>.</summary>
      private void AppendDigits (StringBuilder sb)
      {
          while (IsDigit (Peek ()))
          {
              sb.Append ((char) GetChar ());
          }
      }

      /// <summary>
      /// Converts the collected digits to a double, stores it as the token
      /// value and returns Token.NUMBER.
      /// </summary>
      private int FinishNumber (StringBuilder sb)
      {
          // XPath numbers always use '.' as the decimal separator, so the
          // conversion must not depend on the current thread culture.
          m_objToken = Double.Parse (sb.ToString (),
              System.Globalization.NumberFormatInfo.InvariantInfo);
          return Token.NUMBER;
      }

      /// <summary>
      /// Lexes a number that starts with a digit: Digits ('.' Digits?)?.
      /// A trailing '.' with no fraction digits (e.g. "3.") is accepted, as
      /// the XPath 1.0 Number production allows it.
      /// </summary>
      private int ParseNumber ()
      {
          StringBuilder sb = new StringBuilder ();
          AppendDigits (sb);
          if (Peek () == '.')
          {
              sb.Append ((char) GetChar ());
              AppendDigits (sb);
          }
          return FinishNumber (sb);
      }

      /// <summary>
      /// Lexes the digits of a number written as '.' Digits. The caller has
      /// already consumed the leading '.', so it is restored here to keep
      /// ".5" from being read as 5.
      /// </summary>
      private int ParseFraction ()
      {
          StringBuilder sb = new StringBuilder ("0.");
          AppendDigits (sb);
          return FinishNumber (sb);
      }

      /// <summary>
      /// Lexes a quoted literal. The opening quote character (single or
      /// double) also terminates the literal. Returns Token.ERROR if the
      /// input ends before the closing quote.
      /// </summary>
      private int ParseLiteral ()
      {
          StringBuilder sb = new StringBuilder ();
          int chInit = GetChar ();
          int ch;
          while ((ch = Peek ()) != chInit)
          {
              if (ch == -1)
              {
                  return Token.ERROR;
              }
              sb.Append ((char) GetChar ());
          }
          GetChar ();    // closing quote
          m_objToken = sb.ToString ();
          return Token.LITERAL;
      }

      /// <summary>
      /// Lexes a name. Returns the keyword token if the text is in the
      /// keyword table; otherwise stores the text as the token value and
      /// returns Token.FUNCTION_NAME when the next non-whitespace character
      /// is '(' and Token.NCName when it is not. Returns Token.ERROR if the
      /// current character cannot start a name.
      /// </summary>
      private int ParseIdentifier ()
      {
          if (!IsNameStart (Peek ()))
          {
              // Consume the offending character so the tokenizer makes
              // progress instead of yielding empty names forever.
              GetChar ();
              return Token.ERROR;
          }

          StringBuilder sb = new StringBuilder ();
          do
          {
              sb.Append ((char) GetChar ());
          }
          while (IsNameChar (Peek ()));

          String strToken = sb.ToString ();

          // NOTE(review): keywords are matched regardless of context, so an
          // element named e.g. "text" or "div" is always returned as the
          // keyword token. Confirm against the grammar before changing.
          Object objToken = m_mapTokens [strToken];
          if (objToken != null)
          {
              return (int) objToken;
          }

          m_objToken = strToken;
          SkipWhitespace ();
          if (Peek () == '(')
          {
              return Token.FUNCTION_NAME;
          }
          return Token.NCName;
      }

      /// <summary>True for a letter or '_'. Approximates the XML NCName start-character set.</summary>
      private static bool IsNameStart (int ch)
      {
          return ch == '_' || (ch >= 0 && Char.IsLetter ((char) ch));
      }

      /// <summary>
      /// True for a letter, digit, '_', '-' or '.'. Approximates the XML
      /// NCName character set; '-' is required for keywords such as
      /// "ancestor-or-self".
      /// </summary>
      private static bool IsNameChar (int ch)
      {
          return ch == '_' || ch == '-' || ch == '.' ||
              (ch >= 0 && Char.IsLetterOrDigit ((char) ch));
      }

      /// <summary>True for space, tab, line feed and carriage return.</summary>
      private static bool IsWhitespace (int ch)
      {
          return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
      }

      /// <summary>True for the ASCII digits '0'..'9'.</summary>
      private static bool IsDigit (int ch)
      {
          return ch >= '0' && ch <= '9';
      }

      /// <summary>
      /// Lexes one token starting at the read position and returns its kind.
      /// Assumes leading whitespace has already been skipped.
      /// </summary>
      int ParseToken ()
      {
          switch (Peek ())
          {
              case -1:
                  return Token.EOF;

              case '/':
                  GetChar ();
                  if (Peek () == '/')
                  {
                      GetChar ();
                      return Token.SLASH2;
                  }
                  return Token.SLASH;

              case '.':
                  GetChar ();
                  if (Peek () == '.')
                  {
                      GetChar ();
                      return Token.DOT2;
                  }
                  else if (IsDigit (Peek ()))
                  {
                      // '.' followed by a digit is a number such as ".5".
                      return ParseFraction ();
                  }
                  return Token.DOT;

              case ':':
                  GetChar ();
                  if (Peek () == ':')
                  {
                      GetChar ();
                      return Token.COLON2;
                  }
                  return Token.COLON;

              case ',':
                  GetChar ();
                  return Token.COMMA;

              case '@':
                  GetChar ();
                  return Token.AT;

              case '[':
                  GetChar ();
                  return Token.BRACKET_OPEN;

              case ']':
                  GetChar ();
                  return Token.BRACKET_CLOSE;

              case '(':
                  GetChar ();
                  return Token.PAREN_OPEN;

              case ')':
                  GetChar ();
                  return Token.PAREN_CLOSE;

              case '+':
                  GetChar ();
                  return Token.PLUS;

              case '-':
                  GetChar ();
                  return Token.MINUS;

              case '*':
                  GetChar ();
                  return Token.ASTERISK;

              case '$':
                  GetChar ();
                  return Token.DOLLAR;

              case '|':
                  GetChar ();
                  return Token.BAR;

              case '=':
                  GetChar ();
                  return Token.EQ;

              case '!':
                  GetChar ();
                  if (Peek () == '=')
                  {
                      GetChar ();
                      return Token.NE;
                  }
                  // A lone '!' is not a token; fall out to Token.ERROR below.
                  break;

              case '>':
                  GetChar ();
                  if (Peek () == '=')
                  {
                      GetChar ();
                      return Token.GE;
                  }
                  return Token.GT;

              case '<':
                  GetChar ();
                  if (Peek () == '=')
                  {
                      GetChar ();
                      return Token.LE;
                  }
                  return Token.LT;

              case '\'':
              case '\"':
                  return ParseLiteral ();

              default:
                  if (IsDigit (Peek ()))
                  {
                      return ParseNumber ();
                  }
                  return ParseIdentifier ();
          }
          return Token.ERROR;
      }

      ///////////////////////////
      // yyParser.yyInput methods
      ///////////////////////////

      /// <summary>
      /// Moves on to the next token and skips the whitespace that follows it.
      /// </summary>
      /// <returns>false once the end of the input has been reached.</returns>
      public bool advance ()
      {
          m_objToken = null;
          m_iToken = ParseToken ();
          SkipWhitespace ();
          return (m_iToken != Token.EOF);
      }

      /// <summary>
      /// Returns the kind of the current token, as set by the last advance () call.
      /// </summary>
      public int token ()
      {
          return m_iToken;
      }

      /// <summary>
      /// Returns the value of the current token: a boxed double for numbers,
      /// a string for literals and names, and null for all other tokens.
      /// </summary>
      public Object value ()
      {
          return m_objToken;
      }
  }
  300. }