Tokenizer.cs 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.IO;
  11. using System.Text;
  12. using System.Collections;
  13. using Mono.Xml.XPath;
  14. using Mono.Xml.XPath.yyParser;
  15. namespace System.Xml.XPath
  16. {
  17. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  18. {
  // Input expression as a character array; assigned once by the constructor.
  private readonly char [] m_rgchInput;
  // Index into m_rgchInput of the next unread character.
  private int m_ich;
  // Number of characters in the input; assigned once by the constructor.
  private readonly int m_cch;
  // Id of the current token, as produced by the last call to advance ().
  private int m_iToken;
  // Value attached to the current token (a Double for numbers, a string for
  // literals and names), or null when the token carries no value.
  private Object m_objToken;

  // Maps keyword text to its Token id; filled in by the static constructor
  // from s_rgTokenMap and never reassigned afterwards.
  private static readonly Hashtable s_mapTokens = new Hashtable ();

  // Flat list of (token id, keyword text) pairs: operator names, axis names
  // and node-type names. Entries must stay in pairs, id first.
  private static readonly Object [] s_rgTokenMap =
  {
      Token.AND, "and",
      Token.OR, "or",
      Token.DIV, "div",
      Token.MOD, "mod",
      Token.ANCESTOR, "ancestor",
      Token.ANCESTOR_OR_SELF, "ancestor-or-self",
      Token.ATTRIBUTE, "attribute",
      Token.CHILD, "child",
      Token.DESCENDANT, "descendant",
      Token.DESCENDANT_OR_SELF, "descendant-or-self",
      Token.FOLLOWING, "following",
      Token.FOLLOWING_SIBLING, "following-sibling",
      Token.NAMESPACE, "namespace",
      Token.PARENT, "parent",
      Token.PRECEDING, "preceding",
      Token.PRECEDING_SIBLING, "preceding-sibling",
      Token.SELF, "self",
      Token.COMMENT, "comment",
      Token.TEXT, "text",
      Token.PROCESSING_INSTRUCTION, "processing-instruction",
      Token.NODE, "node",
  };

  // Sentinel character returned by Peek () and GetChar () once the input is exhausted.
  private const char EOL = '\0';
  /// <summary>
  /// Builds the keyword lookup table by walking s_rgTokenMap two entries at a
  /// time and registering each keyword text under its token id.
  /// </summary>
  static Tokenizer ()
  {
      int idx = 0;
      while (idx < s_rgTokenMap.Length)
      {
          // Each pair is stored as (token id, keyword text).
          Object tokenId = s_rgTokenMap [idx];
          Object keyword = s_rgTokenMap [idx + 1];
          s_mapTokens.Add (keyword, tokenId);
          idx += 2;
      }
  }
  /// <summary>
  /// Creates a tokenizer over the given expression text and positions it on
  /// the first non-whitespace character.
  /// </summary>
  /// <param name="strInput">The expression text to tokenize; must not be null.</param>
  /// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
  public Tokenizer (string strInput)
  {
      // Fail with a descriptive exception instead of a NullReferenceException
      // raised from inside ToCharArray ().
      if (strInput == null)
          throw new ArgumentNullException ("strInput");

      m_rgchInput = strInput.ToCharArray ();
      m_ich = 0;
      m_cch = m_rgchInput.Length;
      SkipWhitespace ();
  }
  /// <summary>
  /// Returns the next unread character without consuming it, or EOL when the
  /// read position is at or past the end of the input.
  /// </summary>
  private char Peek ()
  {
      return (m_ich < m_cch) ? m_rgchInput [m_ich] : EOL;
  }
  /// <summary>
  /// Consumes and returns the next character. At end of input it returns EOL
  /// and leaves the read position unchanged.
  /// </summary>
  private char GetChar ()
  {
      if (m_ich < m_cch)
      {
          char current = m_rgchInput [m_ich];
          m_ich++;
          return current;
      }
      return EOL;
  }
  /// <summary>
  /// Steps the read position back by one character and returns the character
  /// now under it. Throws XPathException when already at the start of the input.
  /// </summary>
  private char PutBack ()
  {
      if (m_ich != 0)
      {
          m_ich--;
          return m_rgchInput [m_ich];
      }
      throw new XPathException ("invalid tokenizer state"); // TODO: better description
  }
  /// <summary>
  /// Advances the read position past any run of whitespace characters
  /// (as defined by IsWhitespace), stopping at end of input.
  /// </summary>
  private void SkipWhitespace ()
  {
      while (m_ich < m_cch && IsWhitespace (m_rgchInput [m_ich]))
          m_ich++;
  }
  /// <summary>
  /// Reads a numeric literal (digits, optionally followed by '.' and more
  /// digits; the leading digits may be absent) starting at the current
  /// position, stores its value as a Double in m_objToken and returns
  /// Token.NUMBER.
  /// </summary>
  /// <returns>Token.NUMBER.</returns>
  [MonoTODO]
  private int ParseNumber ()
  {
      StringBuilder sb = new StringBuilder ();

      // Integer part.
      while (IsDigit (Peek ()))
          sb.Append (GetChar ());

      // TODO: doesn't handle '3.' error case
      if (Peek () == '.')
      {
          // Decimal point followed by the fractional digits.
          sb.Append (GetChar ());
          while (IsDigit (Peek ()))
              sb.Append (GetChar ());
      }

      // The text collected above always uses '.' as the decimal separator, so
      // it must be parsed culture-invariantly; the parameterless overload would
      // use the current thread culture and misread or reject the text under
      // cultures whose decimal separator is ','.
      m_objToken = Double.Parse (sb.ToString (), System.Globalization.NumberFormatInfo.InvariantInfo);
      return Token.NUMBER;
  }
  /// <summary>
  /// Reads a quoted string literal. The character at the current position is
  /// taken as the delimiter; everything up to the next occurrence of that
  /// delimiter becomes the literal text, stored in m_objToken.
  /// </summary>
  /// <returns>
  /// Token.LITERAL on success, or Token.ERROR when the input ends before the
  /// closing delimiter is found.
  /// </returns>
  private int ParseLiteral ()
  {
      char quote = GetChar ();
      StringBuilder text = new StringBuilder ();

      for (;;)
      {
          char next = Peek ();
          if (next == quote)
              break;
          if (next == EOL)
              return Token.ERROR;     // unterminated literal
          text.Append (GetChar ());
      }

      // Consume the closing delimiter.
      GetChar ();
      m_objToken = text.ToString ();
      return Token.LITERAL;
  }
  /// <summary>
  /// Reads a run of letters, digits, '_' and '-' starting at the current
  /// position and classifies it.
  /// </summary>
  /// <returns>
  /// The keyword's token id when the text is found in s_mapTokens (the lookup
  /// is unconditional, it does not consider surrounding tokens);
  /// Token.FUNCTION_NAME when the name is followed (after optional whitespace)
  /// by '('; Token.NCName otherwise. For the latter two the name is stored in
  /// m_objToken. Returns Token.ERROR when the current character cannot start
  /// a name at all.
  /// </returns>
  private int ParseIdentifier ()
  {
      StringBuilder sb = new StringBuilder ();
      char ch;
      while ((ch = Peek ()) == '_' || ch == '-' || Char.IsLetterOrDigit (ch))
          sb.Append (GetChar ());

      if (sb.Length == 0)
      {
          // Nothing matched: the current character is not valid here. Consume
          // it and report an error; otherwise an empty NCName would be returned
          // without advancing, and every later call would see the same character.
          GetChar ();
          return Token.ERROR;
      }

      String strToken = sb.ToString ();
      Object objToken = s_mapTokens [strToken];
      if (objToken != null)
          return (int) objToken;

      m_objToken = strToken;

      // A name directly followed by '(' (ignoring whitespace) is a function call.
      SkipWhitespace ();
      if (Peek () == '(')
          return Token.FUNCTION_NAME;
      return Token.NCName;
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is a space, tab, line feed or
  /// carriage return. Deliberately narrower than Char.IsWhiteSpace: no other
  /// character is treated as whitespace.
  /// </summary>
  private static bool IsWhitespace (char ch)
  {
      switch (ch)
      {
          case ' ':
          case '\t':
          case '\n':
          case '\r':
              return true;
          default:
              return false;
      }
  }
  /// <summary>
  /// Tells whether <paramref name="ch"/> is one of the ASCII digits '0'..'9'.
  /// Deliberately narrower than Char.IsDigit: digits from other scripts are
  /// not accepted.
  /// </summary>
  private static bool IsDigit (char ch)
  {
      if (ch < '0' || ch > '9')
          return false;
      return true;
  }
  /// <summary>
  /// Recognizes the token that starts at the current position, consumes its
  /// characters and returns its token id. Punctuation and operators are
  /// handled here; quoted strings, numbers and names are delegated to
  /// ParseLiteral, ParseNumber and ParseIdentifier.
  /// </summary>
  /// <returns>
  /// The token id; Token.EOF at end of input; Token.ERROR for a '!' that is
  /// not followed by '='.
  /// </returns>
  int ParseToken ()
  {
      char first = Peek ();
      switch (first)
      {
          case EOL:
              return Token.EOF;

          // Quoted literals; ParseLiteral consumes the opening quote itself.
          case '\'':
          case '\"':
              return ParseLiteral ();

          // '.' may be "..", the start of a number such as ".5", or a lone dot.
          case '.':
              GetChar ();
              if (Peek () == '.')
                  return Consume (Token.DOT2);
              if (!IsDigit (Peek ()))
                  return Token.DOT;
              // Hand the '.' back so ParseNumber sees the whole literal.
              PutBack ();
              return ParseNumber ();

          // Tokens that are one character long, or two when followed by a
          // specific second character.
          case '/':
              return ConsumeOneOrTwo ('/', Token.SLASH, Token.SLASH2);
          case ':':
              return ConsumeOneOrTwo (':', Token.COLON, Token.COLON2);
          case '>':
              return ConsumeOneOrTwo ('=', Token.GT, Token.GE);
          case '<':
              return ConsumeOneOrTwo ('=', Token.LT, Token.LE);
          case '!':
              // A bare '!' is not a valid token on its own.
              return ConsumeOneOrTwo ('=', Token.ERROR, Token.NE);

          // Plain single-character tokens.
          case ',':
              return Consume (Token.COMMA);
          case '@':
              return Consume (Token.AT);
          case '[':
              return Consume (Token.BRACKET_OPEN);
          case ']':
              return Consume (Token.BRACKET_CLOSE);
          case '(':
              return Consume (Token.PAREN_OPEN);
          case ')':
              return Consume (Token.PAREN_CLOSE);
          case '+':
              return Consume (Token.PLUS);
          case '-':
              return Consume (Token.MINUS);
          case '*':
              return Consume (Token.ASTERISK);
          case '$':
              return Consume (Token.DOLLAR);
          case '|':
              return Consume (Token.BAR);
          case '=':
              return Consume (Token.EQ);

          default:
              return IsDigit (first) ? ParseNumber () : ParseIdentifier ();
      }
  }

  // Consumes the current character and returns the given token id.
  private int Consume (int token)
  {
      GetChar ();
      return token;
  }

  // Consumes the current character; if the next one equals 'second' it is
  // consumed as well and 'pair' is returned, otherwise 'single' is returned.
  private int ConsumeOneOrTwo (char second, int single, int pair)
  {
      GetChar ();
      if (Peek () == second)
          return Consume (pair);
      return single;
  }
  256. ///////////////////////////
  257. // yyParser.yyInput methods
  258. ///////////////////////////
  /// <summary>
  /// Moves on to the next token: clears the previous token value, parses the
  /// next token and then skips any whitespace that follows it.
  /// </summary>
  /// <returns>false once the end-of-input token has been reached; true otherwise.</returns>
  public bool advance ()
  {
      m_objToken = null;

      int tokenId = ParseToken ();
      m_iToken = tokenId;

      SkipWhitespace ();
      return !(tokenId == Token.EOF);
  }
  /// <summary>
  /// Classifies the current token. Should not be called if advance ()
  /// returned false.
  /// </summary>
  /// <returns>The token id recorded by the most recent advance () call.</returns>
  public int token ()
  {
      return this.m_iToken;
  }
  /// <summary>
  /// Gets the value associated with the current token. Should not be called
  /// if advance () returned false.
  /// </summary>
  /// <returns>
  /// The value recorded while parsing the current token, or null when the
  /// token carries no value.
  /// </returns>
  public Object value ()
  {
      return this.m_objToken;
  }
  286. }
  287. }