Tokenizer.cs 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.IO;
  11. using System.Text;
  12. using System.Collections;
  13. using Mono.Xml.XPath;
  14. using Mono.Xml.XPath.yyParser;
  15. namespace System.Xml.XPath
  16. {
  17. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  18. {
  // Input expression as a character buffer, the read cursor into that
  // buffer, and the number of characters in it. The buffer and its length
  // are assigned once by the constructor; only the cursor moves.
  private readonly char [] m_rgchInput;
  private int m_ich;
  private readonly int m_cch;

  // Kind and value of the token produced by the most recent advance ().
  // advance () resets the value to null before scanning, so it stays null
  // for tokens that carry no payload.
  private int m_iToken;
  private Object m_objToken;

  // Reserved-word spelling -> boxed Token constant. Populated once by the
  // static constructor and only read afterwards.
  private static readonly Hashtable m_mapTokens = new Hashtable ();

  // Flat list of (token constant, spelling) pairs used to seed m_mapTokens.
  // Entries must stay in pairs: the static constructor walks it two at a time.
  private static readonly Object [] rgTokenMap =
  {
      // operators
      Token.AND, "and",
      Token.OR, "or",
      Token.DIV, "div",
      Token.MOD, "mod",
      // axis names
      Token.ANCESTOR, "ancestor",
      Token.ANCESTOR_OR_SELF, "ancestor-or-self",
      Token.ATTRIBUTE, "attribute",
      Token.CHILD, "child",
      Token.DESCENDANT, "descendant",
      Token.DESCENDANT_OR_SELF, "descendant-or-self",
      Token.FOLLOWING, "following",
      Token.FOLLOWING_SIBLING, "following-sibling",
      Token.NAMESPACE, "namespace",
      Token.PARENT, "parent",
      Token.PRECEDING, "preceding",
      Token.PRECEDING_SIBLING, "preceding-sibling",
      Token.SELF, "self",
      // node types
      Token.COMMENT, "comment",
      Token.TEXT, "text",
      Token.PROCESSING_INSTRUCTION, "processing-instruction",
      Token.NODE, "node",
  };
  // Fills the reserved-word table from rgTokenMap, which stores
  // (token constant, spelling) pairs; the table is keyed by spelling.
  static Tokenizer ()
  {
      int iPair = 0;
      while (iPair < rgTokenMap.Length)
      {
          Object objToken = rgTokenMap [iPair];
          Object objSpelling = rgTokenMap [iPair + 1];
          m_mapTokens.Add (objSpelling, objToken);
          iPair += 2;
      }
  }
  /// <summary>
  /// Creates a tokenizer over the given expression text and moves the
  /// cursor past any leading whitespace.
  /// </summary>
  /// <param name="strInput">Expression text to tokenize; must not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
  public Tokenizer (string strInput)
  {
      // Fail with a descriptive exception instead of a bare
      // NullReferenceException from ToCharArray ().
      if (strInput == null)
          throw new ArgumentNullException ("strInput");

      m_rgchInput = strInput.ToCharArray ();
      m_ich = 0;
      m_cch = m_rgchInput.Length;
      SkipWhitespace ();
  }
  /// <summary>
  /// Returns the character under the cursor without consuming it,
  /// or -1 when the cursor is at or past the end of the input.
  /// </summary>
  private int Peek ()
  {
      return (m_ich < m_cch) ? m_rgchInput [m_ich] : -1;
  }
  /// <summary>
  /// Consumes and returns the character under the cursor. At end of input
  /// nothing is consumed and -1 is returned.
  /// </summary>
  private int GetChar ()
  {
      int chResult = -1;
      if (m_ich < m_cch)
      {
          chResult = m_rgchInput [m_ich];
          m_ich++;
      }
      return chResult;
  }
  /// <summary>
  /// Moves the cursor forward to the next non-whitespace character,
  /// stopping at end of input.
  /// </summary>
  private void SkipWhitespace ()
  {
      while (m_ich < m_cch && IsWhitespace (m_rgchInput [m_ich]))
          m_ich++;
  }
  /// <summary>
  /// Scans a number at the cursor: a run of digits, optionally followed by
  /// '.' and another (possibly empty) run of digits. The parsed value is
  /// stored in m_objToken as a boxed double.
  /// </summary>
  /// <returns>Token.NUMBER.</returns>
  [MonoTODO]
  private int ParseNumber ()
  {
      StringBuilder sb = new StringBuilder ();

      // Integer part (may be empty when the number starts with '.').
      while (IsDigit (Peek ()))
          sb.Append ((char) GetChar ());

      // Optional fraction. A trailing '.' with no digits after it ("3.") is
      // accepted; the earlier TODO called this an unhandled error case, but
      // the XPath 1.0 Number production (Digits ('.' Digits?)?) permits it.
      if (Peek () == '.')
      {
          sb.Append ((char) GetChar ());
          while (IsDigit (Peek ()))
              sb.Append ((char) GetChar ());
      }

      // The text scanned above always uses '.' as the decimal separator, so
      // it must be parsed culture-independently. Parsing with the current
      // culture breaks on locales whose decimal separator is ','.
      m_objToken = Double.Parse (sb.ToString (),
          System.Globalization.CultureInfo.InvariantCulture);
      return Token.NUMBER;
  }
  /// <summary>
  /// Scans a quoted string. The character under the cursor is taken as the
  /// delimiter and everything up to the next occurrence of it becomes the
  /// token value (stored in m_objToken without the delimiters).
  /// </summary>
  /// <returns>
  /// Token.LITERAL on success. If the input ends before the closing
  /// delimiter, the rest of the input is consumed, m_objToken is left
  /// untouched and Token.ERROR is returned.
  /// </returns>
  private int ParseLiteral ()
  {
      int chQuote = GetChar ();
      int ichStart = m_ich;

      while (true)
      {
          int chNext = Peek ();
          if (chNext == chQuote)
              break;
          if (chNext == -1)
              return Token.ERROR;
          GetChar ();
      }

      // Cursor now sits on the closing delimiter; copy the text in between.
      String strLiteral = new String (m_rgchInput, ichStart, m_ich - ichStart);
      GetChar ();
      m_objToken = strLiteral;
      return Token.LITERAL;
  }
  /// <summary>
  /// Scans a name at the cursor and classifies it.
  /// </summary>
  /// <returns>
  /// The mapped Token constant when the name is one of the reserved words in
  /// m_mapTokens (m_objToken is not set in that case). Otherwise the name is
  /// stored in m_objToken and the result is Token.FUNCTION_NAME when the next
  /// non-whitespace character is '(', or Token.NCName if it is not.
  /// Token.ERROR when the character under the cursor cannot start a name.
  /// </returns>
  private int ParseIdentifier ()
  {
      StringBuilder sb = new StringBuilder ();
      while (IsNameChar (Peek (), sb.Length == 0))
          sb.Append ((char) GetChar ());

      if (sb.Length == 0)
      {
          // The character under the cursor is not part of any token. Consume
          // it so that repeated calls make progress, and report an error
          // rather than an empty name.
          GetChar ();
          return Token.ERROR;
      }

      String strToken = sb.ToString ();

      // NOTE(review): reserved words are returned regardless of context, so
      // an element that happens to be named e.g. "div" or "text" produces the
      // operator / node-type token. Whether the grammar compensates for this
      // is not visible from here -- confirm against the parser.
      Object objToken = m_mapTokens [strToken];
      if (objToken != null)
          return (int) objToken;

      m_objToken = strToken;

      // Whitespace may separate a function name from its '(' .
      SkipWhitespace ();
      if (Peek () == '(')
          return Token.FUNCTION_NAME;
      return Token.NCName;
  }

  // True when ch may appear in a name at the given position. Letters and '_'
  // are accepted anywhere, as is '-' (kept from the previous behaviour);
  // digits and '.' are accepted only after the first character. Unicode
  // letters and digits are accepted via System.Char, which approximates
  // (but does not exactly reproduce) the XML name character classes.
  private static bool IsNameChar (int ch, bool fFirst)
  {
      if (ch < 0)
          return false;
      if (ch == '_' || ch == '-')
          return true;
      if (fFirst)
          return Char.IsLetter ((char) ch);
      return ch == '.' || Char.IsLetterOrDigit ((char) ch);
  }
  /// <summary>
  /// Tells whether ch is a space, tab, line feed or carriage return.
  /// </summary>
  private static bool IsWhitespace (int ch)
  {
      switch (ch)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
          return true;
      default:
          return false;
      }
  }
  /// <summary>
  /// Tells whether ch is one of the ASCII digits '0' through '9'.
  /// </summary>
  private static bool IsDigit (int ch)
  {
      if (ch < '0')
          return false;
      return ch <= '9';
  }
  /// <summary>
  /// Scans the single token that starts at the cursor and leaves the cursor
  /// just past it. Tokens with a payload (numbers, literals, names) store it
  /// in m_objToken through the Parse* helpers.
  /// </summary>
  /// <returns>
  /// The Token constant for what was scanned; Token.EOF at end of input;
  /// Token.ERROR for a '!' that is not followed by '=' (and whatever error
  /// results the Parse* helpers report).
  /// </returns>
  int ParseToken ()
  {
      int ch = Peek ();
      switch (ch)
      {
      case -1:
          return Token.EOF;
      case '/':
          return ConsumeOneOrTwo ('/', Token.SLASH2, Token.SLASH);
      case '.':
          // ".5" is a number. The '.' must still be in the input when
          // ParseNumber runs, otherwise the fraction digits are read as
          // the integer part and ".5" comes out as 5.
          if (m_ich + 1 < m_cch && IsDigit (m_rgchInput [m_ich + 1]))
              return ParseNumber ();
          return ConsumeOneOrTwo ('.', Token.DOT2, Token.DOT);
      case ':':
          return ConsumeOneOrTwo (':', Token.COLON2, Token.COLON);
      case ',':
          return ConsumeAs (Token.COMMA);
      case '@':
          return ConsumeAs (Token.AT);
      case '[':
          return ConsumeAs (Token.BRACKET_OPEN);
      case ']':
          return ConsumeAs (Token.BRACKET_CLOSE);
      case '(':
          return ConsumeAs (Token.PAREN_OPEN);
      case ')':
          return ConsumeAs (Token.PAREN_CLOSE);
      case '+':
          return ConsumeAs (Token.PLUS);
      case '-':
          return ConsumeAs (Token.MINUS);
      case '*':
          return ConsumeAs (Token.ASTERISK);
      case '$':
          return ConsumeAs (Token.DOLLAR);
      case '|':
          return ConsumeAs (Token.BAR);
      case '=':
          return ConsumeAs (Token.EQ);
      case '!':
          // A lone '!' is not a token; only "!=" is.
          return ConsumeOneOrTwo ('=', Token.NE, Token.ERROR);
      case '>':
          return ConsumeOneOrTwo ('=', Token.GE, Token.GT);
      case '<':
          return ConsumeOneOrTwo ('=', Token.LE, Token.LT);
      case '\'':
      case '\"':
          return ParseLiteral ();
      default:
          if (IsDigit (ch))
              return ParseNumber ();
          return ParseIdentifier ();
      }
  }

  // Consumes the character under the cursor and returns iToken.
  private int ConsumeAs (int iToken)
  {
      GetChar ();
      return iToken;
  }

  // Consumes the character under the cursor; if the following character is
  // chSecond it is consumed as well and iTokenTwo is returned, otherwise
  // iTokenOne is returned.
  private int ConsumeOneOrTwo (char chSecond, int iTokenTwo, int iTokenOne)
  {
      GetChar ();
      if (Peek () != chSecond)
          return iTokenOne;
      GetChar ();
      return iTokenTwo;
  }
  260. ///////////////////////////
  261. // yyParser.yyInput methods
  262. ///////////////////////////
  /// <summary>
  /// Moves on to the next token: clears the previous token value, scans one
  /// token and then skips any whitespace that follows it.
  /// </summary>
  /// <returns>false once the end-of-input token has been reached.</returns>
  public bool advance ()
  {
      m_objToken = null;

      int iScanned = ParseToken ();
      m_iToken = iScanned;

      SkipWhitespace ();
      return iScanned != Token.EOF;
  }
  /// <summary>
  /// Classifies the current token. Should not be called after advance ()
  /// has returned false.
  /// </summary>
  /// <returns>The token kind recorded by the last advance ().</returns>
  public int token ()
  {
      return this.m_iToken;
  }
  /// <summary>
  /// Value associated with the current token. Should not be called after
  /// advance () has returned false.
  /// </summary>
  /// <returns>
  /// The value recorded for token () by the last advance (); null when the
  /// token carries no value.
  /// </returns>
  public Object value ()
  {
      return this.m_objToken;
  }
  290. }
  291. }