Tokenizer.cs 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. //
  2. // System.Xml.XPath.Tokenizer
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. //
  7. // (C) 2002 Piers Haken
  8. //
  9. using System;
  10. using System.IO;
  11. using System.Text;
  12. using System.Collections;
  13. using Mono.Xml.XPath;
  14. using Mono.Xml.XPath.yyParser;
  15. namespace System.Xml.XPath
  16. {
  17. internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
  18. {
		// Input expression as a character buffer, with the read cursor (m_ich)
		// and the total number of characters (m_cch).
		private char [] m_rgchInput;
		private int m_cch;
		private int m_ich;

		// Code of the most recently scanned token and the value attached to it.
		// advance () resets the value to null before each scan; only number,
		// literal, function-name and NCName tokens assign one.
		private int m_iToken;
		private object m_objToken;

		// Recomputed by advance () after every token: true when that token is
		// listed in s_rgfPrevWasSpecial and no whitespace followed it.
		// ParseIdentifier () consults it before returning a reserved word as its
		// keyword token. Starts out as false (the default for bool).
		private bool m_fPrevWasSpecial;
		// Reserved-word lookup (spelling -> Token constant). Filled in by the
		// static constructor from s_rgTokenMap.
		private static readonly Hashtable s_mapTokens = new Hashtable ();

		// Flat list of (Token constant, spelling) pairs. The static constructor
		// walks it two entries at a time, so entries must always be added in pairs.
		private static readonly object [] s_rgTokenMap = new object []
		{
			// operator names
			Token.AND, "and",
			Token.OR, "or",
			Token.DIV, "div",
			Token.MOD, "mod",

			// axis names
			Token.ANCESTOR, "ancestor",
			Token.ANCESTOR_OR_SELF, "ancestor-or-self",
			Token.ATTRIBUTE, "attribute",
			Token.CHILD, "child",
			Token.DESCENDANT, "descendant",
			Token.DESCENDANT_OR_SELF, "descendant-or-self",
			Token.FOLLOWING, "following",
			Token.FOLLOWING_SIBLING, "following-sibling",
			Token.NAMESPACE, "namespace",
			Token.PARENT, "parent",
			Token.PRECEDING, "preceding",
			Token.PRECEDING_SIBLING, "preceding-sibling",
			Token.SELF, "self",

			// node types
			Token.COMMENT, "comment",
			Token.TEXT, "text",
			Token.PROCESSING_INSTRUCTION, "processing-instruction",
			Token.NODE, "node",
		};
		// Set of token codes used by advance () to compute m_fPrevWasSpecial.
		// Only the keys matter; it is filled in by the static constructor.
		private static readonly Hashtable s_mapfPrevWasSpecial = new Hashtable ();

		// Token codes after which a following name is not immediately returned
		// as a reserved word by ParseIdentifier ().
		private static readonly int [] s_rgfPrevWasSpecial = new int []
		{
			// punctuation that opens a new operand position
			Token.AT,
			Token.COLON2,
			Token.PAREN_OPEN,
			Token.BRACKET_OPEN,
			Token.COMMA,

			// named operators
			Token.AND,
			Token.OR,
			Token.DIV,
			Token.MOD,

			// symbolic operators
			Token.SLASH,
			Token.SLASH2,
			Token.BAR,
			Token.PLUS,
			Token.MINUS,
			Token.EQ,
			Token.NE,
			Token.LE,
			Token.LT,
			Token.GE,
			Token.GT,
			Token.ASTERISK,
		};
		// Sentinel returned by Peek () and GetChar () once the cursor has reached
		// the end of the input buffer; ParseToken () maps it to Token.EOF.
		private const char EOL = '\u0000';
		// Builds the two static lookup tables from their flat source arrays.
		static Tokenizer ()
		{
			// s_rgTokenMap holds (token, spelling) pairs; index them by spelling.
			for (int i = 0; i < s_rgTokenMap.Length; i += 2)
				s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);

			// Only key membership is ever queried (via Contains), so the
			// stored values are left null.
			for (int i = 0; i < s_rgfPrevWasSpecial.Length; i++)
				s_mapfPrevWasSpecial.Add (s_rgfPrevWasSpecial [i], null);
		}
		// Creates a tokenizer over the given XPath expression and positions the
		// cursor on the first non-whitespace character.
		//
		// strInput: the expression text; must not be null.
		// Throws ArgumentNullException when strInput is null.
		public Tokenizer (string strInput)
		{
			if (strInput == null)
				throw new ArgumentNullException ("strInput");

			m_rgchInput = strInput.ToCharArray ();
			m_ich = 0;
			m_cch = strInput.Length;

			// There is no preceding token yet. XPath 1.0 only allows a name to be
			// read as an operator name when a token precedes it, so start in the
			// "special" state: a leading name such as "div" or "and" then goes
			// through the '(' / '::' look-ahead in ParseIdentifier () instead of
			// being returned as a keyword token straight away.
			m_fPrevWasSpecial = true;

			SkipWhitespace ();
		}
		// Returns the character iOffset positions ahead of the cursor without
		// consuming anything, or EOL when that position is at or past the end
		// of the input.
		private char Peek (int iOffset)
		{
			int iPos = m_ich + iOffset;
			return (iPos < m_cch) ? m_rgchInput [iPos] : EOL;
		}
		// Returns the character under the cursor without consuming it, or EOL
		// when the cursor is at the end of the input.
		private char Peek ()
		{
			return (m_ich < m_cch) ? m_rgchInput [m_ich] : EOL;
		}
		// Consumes and returns the character under the cursor. At the end of
		// the input it returns EOL and leaves the cursor where it is.
		private char GetChar ()
		{
			if (m_ich < m_cch)
				return m_rgchInput [m_ich++];

			return EOL;
		}
		// Moves the cursor back by one character and returns the character it
		// now points at. Throws XPathException when the cursor is already at
		// the start of the input.
		private char PutBack ()
		{
			if (m_ich != 0)
			{
				m_ich--;
				return m_rgchInput [m_ich];
			}

			throw new XPathException ("invalid tokenizer state"); // TODO: better description
		}
		// Advances the cursor past any run of whitespace characters.
		// Returns true if at least one whitespace character was skipped.
		private bool SkipWhitespace ()
		{
			bool fSkipped = false;
			while (IsWhitespace (Peek ()))
			{
				GetChar ();
				fSkipped = true;
			}
			return fSkipped;
		}
		// Scans a numeric literal starting at the cursor: a run of digits,
		// optionally followed by '.' and another run of digits. The parsed value
		// is stored in m_objToken as a double.
		//
		// Returns Token.NUMBER.
		[MonoTODO]
		private int ParseNumber ()
		{
			StringBuilder sb = new StringBuilder ();

			while (IsDigit (Peek ()))
				sb.Append (GetChar ());

			// TODO: a trailing '.' with no fraction digits ("3.") is accepted as
			// is and not treated as an error.
			if (Peek () == '.')
			{
				sb.Append (GetChar ());
				while (IsDigit (Peek ()))
					sb.Append (GetChar ());
			}

			// XPath numbers always use '.' as the decimal separator, so parse with
			// the invariant culture; the current culture may use ',' instead and
			// would then reject or misread the text.
			m_objToken = Double.Parse (sb.ToString (), System.Globalization.CultureInfo.InvariantCulture);
			return Token.NUMBER;
		}
		// Scans a quoted string literal. The character under the cursor is taken
		// as the delimiter; everything up to the next occurrence of that same
		// character becomes the literal's text, stored in m_objToken.
		//
		// Returns Token.LITERAL, or Token.ERROR when the input ends before the
		// closing delimiter is found.
		private int ParseLiteral ()
		{
			char chQuote = GetChar ();
			StringBuilder sbText = new StringBuilder ();

			for (;;)
			{
				char chNext = Peek ();
				if (chNext == chQuote)
					break;
				if (chNext == EOL)
					return Token.ERROR;
				sbText.Append (GetChar ());
			}

			GetChar (); // consume the closing delimiter
			m_objToken = sbText.ToString ();
			return Token.LITERAL;
		}
		// Scans a name (letters, digits, '_', '-' and '.') and classifies it.
		//
		// - A reserved word is returned as its keyword token at once when the
		//   previous token was not "special" (see m_fPrevWasSpecial).
		// - Otherwise trailing whitespace is skipped and the next characters
		//   decide: a reserved word followed by '(' or '::' is returned as its
		//   keyword token; any other name followed by '(' is a
		//   Token.FUNCTION_NAME; everything else is a Token.NCName.
		//
		// For FUNCTION_NAME and NCName the scanned text is stored in m_objToken.
		private int ParseIdentifier ()
		{
			StringBuilder sbName = new StringBuilder ();
			for (;;)
			{
				char chNext = Peek ();
				bool fNameChar = chNext == '_' || chNext == '-' || chNext == '.' || Char.IsLetterOrDigit (chNext);
				if (!fNameChar)
					break;
				sbName.Append (GetChar ());
			}

			string strName = sbName.ToString ();
			object objKeyword = s_mapTokens [strName];
			bool fKeyword = (objKeyword != null);

			if (fKeyword && !m_fPrevWasSpecial)
				return (int) objKeyword;

			// Look past any whitespace to see what follows the name.
			SkipWhitespace ();
			char chFollow = Peek ();
			bool fCall = (chFollow == '(');
			bool fAxis = (chFollow == ':' && Peek (1) == ':');

			if (fKeyword && (fCall || fAxis))
				return (int) objKeyword;

			m_objToken = strName;
			return fCall ? Token.FUNCTION_NAME : Token.NCName;
		}
		// True for the four whitespace characters recognised by the tokenizer:
		// space, tab, line feed and carriage return. Deliberately narrower than
		// Char.IsWhiteSpace.
		private static bool IsWhitespace (char ch)
		{
			switch (ch)
			{
			case ' ':
			case '\t':
			case '\n':
			case '\r':
				return true;
			default:
				return false;
			}
		}
		// True only for the ASCII digits '0' through '9'. Deliberately narrower
		// than Char.IsDigit.
		private static bool IsDigit (char ch)
		{
			return !(ch < '0' || ch > '9');
		}
		// Scans one token starting at the cursor and returns its Token code.
		// Returns Token.EOF at the end of the input and Token.ERROR for input
		// that does not start any known token. Tokens that carry a value
		// (numbers, literals, names) store it in m_objToken through the
		// Parse* helpers.
		int ParseToken ()
		{
			char chFirst = Peek ();
			switch (chFirst)
			{
			case EOL:
				return Token.EOF;

			// string literals; the delimiter is consumed by ParseLiteral ()
			case '\'':
			case '\"':
				return ParseLiteral ();

			// tokens that may be one or two characters long
			case '/':
				return ScanOneOrTwo ('/', Token.SLASH, Token.SLASH2);
			case ':':
				return ScanOneOrTwo (':', Token.COLON, Token.COLON2);
			case '>':
				return ScanOneOrTwo ('=', Token.GT, Token.GE);
			case '<':
				return ScanOneOrTwo ('=', Token.LT, Token.LE);
			case '!':
				// a lone '!' is not a token
				return ScanOneOrTwo ('=', Token.ERROR, Token.NE);

			// '.', '..' or a number that starts with a decimal point
			case '.':
				GetChar ();
				if (Peek () == '.')
				{
					GetChar ();
					return Token.DOT2;
				}
				if (!IsDigit (Peek ()))
					return Token.DOT;
				PutBack (); // let ParseNumber () see the leading '.'
				return ParseNumber ();

			// single-character tokens
			case ',':
				return ScanSingle (Token.COMMA);
			case '@':
				return ScanSingle (Token.AT);
			case '[':
				return ScanSingle (Token.BRACKET_OPEN);
			case ']':
				return ScanSingle (Token.BRACKET_CLOSE);
			case '(':
				return ScanSingle (Token.PAREN_OPEN);
			case ')':
				return ScanSingle (Token.PAREN_CLOSE);
			case '+':
				return ScanSingle (Token.PLUS);
			case '-':
				return ScanSingle (Token.MINUS);
			case '*':
				return ScanSingle (Token.ASTERISK);
			case '$':
				return ScanSingle (Token.DOLLAR);
			case '|':
				return ScanSingle (Token.BAR);
			case '=':
				return ScanSingle (Token.EQ);
			}

			if (IsDigit (chFirst))
				return ParseNumber ();

			if (Char.IsLetter (chFirst) || chFirst == '_') // NCName
				return ParseIdentifier ();

			// Unknown character: reported as an error and left unconsumed.
			return Token.ERROR;
		}

		// Consumes the character under the cursor and returns iToken.
		private int ScanSingle (int iToken)
		{
			GetChar ();
			return iToken;
		}

		// Consumes the character under the cursor. If the next character equals
		// chSecond it is consumed as well and iDouble is returned; otherwise
		// iSingle is returned.
		private int ScanOneOrTwo (char chSecond, int iSingle, int iDouble)
		{
			GetChar ();
			if (Peek () != chSecond)
				return iSingle;

			GetChar ();
			return iDouble;
		}
  304. ///////////////////////////
  305. // yyParser.yyInput methods
  306. ///////////////////////////
		/** scans the next token and makes it the current one.
		    Also skips the whitespace that follows the token and records whether
		    the token counts as "special" for the next name that is scanned: it
		    does only when it is listed in s_rgfPrevWasSpecial and was not
		    followed by whitespace.
		    @return false once the end of the input has been reached.
		  */
		public bool advance ()
		{
			m_objToken = null;
			m_iToken = ParseToken ();

			bool fTrailingSpace = SkipWhitespace ();
			if (fTrailingSpace)
				m_fPrevWasSpecial = false;
			else
				m_fPrevWasSpecial = s_mapfPrevWasSpecial.Contains (m_iToken);

			return m_iToken != Token.EOF;
		}
		/** code of the current token, as stored by the last call to advance().
		    Should not be called if advance() returned false.
		    @return the Token constant of the current token.
		  */
		public int token ()
		{
			return this.m_iToken;
		}
		/** value attached to the current token.
		    Should not be called if advance() returned false.
		    @return the value stored while scanning the current token (a double
		            for numbers, a string for literals and names), or null when
		            the token carries no value.
		  */
		public Object value ()
		{
			return this.m_objToken;
		}
  335. }
  336. }