Tokenizer.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. //
  2. // System.Xml.XPath.Tokenizer.cs / Mono.Xml.Xsl/PatternTokenizer.cs
  3. //
  4. // Author:
  5. // Piers Haken ([email protected])
  6. // Atsushi Enomoto ([email protected])
  7. //
  8. // (C) 2002 Piers Haken
  9. // (C) 2005 Novell Inc,
  10. //
  11. // IMPORTANT:
  12. //
  13. // Do not edit PatternTokenizer.cs. It is autogenerated.
  14. //
  15. //
  16. // Permission is hereby granted, free of charge, to any person obtaining
  17. // a copy of this software and associated documentation files (the
  18. // "Software"), to deal in the Software without restriction, including
  19. // without limitation the rights to use, copy, modify, merge, publish,
  20. // distribute, sublicense, and/or sell copies of the Software, and to
  21. // permit persons to whom the Software is furnished to do so, subject to
  22. // the following conditions:
  23. //
  24. // The above copyright notice and this permission notice shall be
  25. // included in all copies or substantial portions of the Software.
  26. //
  27. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  28. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  29. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  30. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  31. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  32. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  33. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  34. //
  35. using System;
  36. using System.Globalization;
  37. using System.IO;
  38. using System.Text;
  39. using System.Collections;
  40. using System.Xml;
  41. using System.Xml.XPath;
  42. using Mono.Xml.XPath;
  43. #if XSLT_PATTERN
  44. namespace Mono.Xml.Xsl
  45. #else
  46. namespace Mono.Xml.XPath
  47. #endif
  48. {
  49. internal class Tokenizer : yyParser.yyInput
  50. {
  // ---- scan position ------------------------------------------------

  // Expression text being tokenized.
  private string m_rgchInput;
  // Index of the next character that GetChar () will hand out.
  private int m_ich;
  // Length of m_rgchInput, captured once by the constructor.
  private int m_cch;

  // ---- token state --------------------------------------------------

  // Token id produced by the most recent advance ().
  private int m_iToken;
  // Token id of the preceding advance (); Token.EOF is what
  // IsFirstToken tests for.
  private int m_iTokenPrev = Token.EOF;
  // Value attached to the current token; returned by value ().
  private Object m_objToken;
  // Whether the previous / current token was flagged as an operator.
  // (bool fields start out false.)
  private bool m_fPrevWasOperator;
  private bool m_fThisIsOperator;

  // ---- reserved names -----------------------------------------------

  // Reserved name -> boxed token id; filled by the static constructor.
  private static readonly Hashtable s_mapTokens = new Hashtable ();

  // Flat list of (token id, reserved name) pairs.
  private static readonly Object [] s_rgTokenMap = new Object []
  {
      // operator names
      Token.AND,                    "and",
      Token.OR,                     "or",
      Token.DIV,                    "div",
      Token.MOD,                    "mod",
      // axis names
      Token.ANCESTOR,               "ancestor",
      Token.ANCESTOR_OR_SELF,       "ancestor-or-self",
      Token.ATTRIBUTE,              "attribute",
      Token.CHILD,                  "child",
      Token.DESCENDANT,             "descendant",
      Token.DESCENDANT_OR_SELF,     "descendant-or-self",
      Token.FOLLOWING,              "following",
      Token.FOLLOWING_SIBLING,      "following-sibling",
      Token.NAMESPACE,              "namespace",
      Token.PARENT,                 "parent",
      Token.PRECEDING,              "preceding",
      Token.PRECEDING_SIBLING,      "preceding-sibling",
      Token.SELF,                   "self",
      // node types
      Token.COMMENT,                "comment",
      Token.TEXT,                   "text",
      Token.PROCESSING_INSTRUCTION, "processing-instruction",
      Token.NODE,                   "node",
  };

  // Sentinel returned by Peek () / GetChar () once the input is exhausted.
  private const char EOL = '\0';
  // Fills s_mapTokens (reserved name -> token id) from the flat
  // (token id, name) pair list in s_rgTokenMap.
  static Tokenizer ()
  {
      int iPair = 0;
      while (iPair < s_rgTokenMap.Length)
      {
          Object objName = s_rgTokenMap [iPair + 1];
          Object objTokenId = s_rgTokenMap [iPair];
          s_mapTokens.Add (objName, objTokenId);
          iPair += 2;
      }
  }
  // Creates a tokenizer over strInput and positions it on the first
  // non-whitespace character.
  //
  // strInput: the expression text to tokenize; must not be null.
  // Throws ArgumentNullException when strInput is null (previously this
  // surfaced as a NullReferenceException from reading strInput.Length).
  public Tokenizer (string strInput)
  {
      if (strInput == null)
          throw new ArgumentNullException ("strInput");

      m_rgchInput = strInput;
      m_ich = 0;
      m_cch = strInput.Length;
      // Leading whitespace is never part of a token.
      SkipWhitespace ();
  }
  // Returns the character iOffset positions ahead of the current one
  // without consuming anything, or EOL when that position lies at or
  // beyond the end of the input.
  private char Peek (int iOffset)
  {
      int ichTarget = m_ich + iOffset;
      return (ichTarget < m_cch) ? m_rgchInput [ichTarget] : EOL;
  }
  // Returns the current character without consuming it, or EOL when
  // the input is exhausted.
  private char Peek ()
  {
      if (m_ich < m_cch)
          return m_rgchInput [m_ich];
      return EOL;
  }
  // Consumes and returns the current character. At the end of the
  // input nothing is consumed and EOL is returned.
  private char GetChar ()
  {
      if (m_ich < m_cch)
      {
          char chCurrent = m_rgchInput [m_ich];
          m_ich += 1;
          return chCurrent;
      }
      return EOL;
  }
  // Steps the scan position back by one character and returns the
  // character that is now current.
  // Throws XPathException when already at the start of the input.
  private char PutBack ()
  {
      if (m_ich != 0)
      {
          m_ich -= 1;
          return m_rgchInput [m_ich];
      }
      throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
  }
  // Consumes a run of whitespace at the current position.
  // Returns true if at least one character was skipped.
  private bool SkipWhitespace ()
  {
      int ichBefore = m_ich;
      while (IsWhitespace (Peek ()))
          GetChar ();
      // Every whitespace character consumed moved m_ich forward.
      return m_ich != ichBefore;
  }
  // Scans Digits ('.' Digits?)? or '.' Digits at the current position,
  // stores the parsed double in m_objToken and returns Token.NUMBER.
  private int ParseNumber ()
  {
      int ichStart = m_ich;

      // integer part (may be empty for input such as ".5")
      while (IsDigit (Peek ()))
          GetChar ();

      // A trailing '.' with no digits after it ("3.") is deliberately
      // accepted; it is legal per XPath 3.7 syntax [30].
      if (Peek () == '.')
      {
          GetChar ();
          while (IsDigit (Peek ()))
              GetChar ();
      }

      string strNumber = m_rgchInput.Substring (ichStart, m_ich - ichStart);
      m_objToken = Double.Parse (strNumber, NumberFormatInfo.InvariantInfo);
      return Token.NUMBER;
  }
  // Scans a quoted string. The current character is taken as the
  // opening quote and the literal runs up to the next occurrence of
  // that same character. The text between the quotes is stored in
  // m_objToken and Token.LITERAL is returned.
  // Throws XPathException if EOL is seen before the closing quote.
  private int ParseLiteral ()
  {
      char chQuote = GetChar ();
      int ichStart = m_ich;

      for (char chCur = Peek (); chCur != chQuote; chCur = Peek ())
      {
          if (chCur == EOL)
              throw new XPathException ("unmatched "+chQuote+" in expression");
          GetChar ();
      }

      string strLiteral = m_rgchInput.Substring (ichStart, m_ich - ichStart);
      GetChar ();    // closing quote
      m_objToken = strLiteral;
      return Token.LITERAL;
  }
  // Reads a name at the current position: a letter or '_' followed by
  // any number of letters, digits, '_', '-' or '.'. Whitespace after
  // the name is skipped as well.
  // Returns the name, or null (consuming nothing) when the current
  // character cannot start a name.
  private string ReadIdentifier ()
  {
      char chFirst = Peek ();
      bool fNameStart = Char.IsLetter (chFirst) || chFirst == '_';
      if (!fNameStart)
          return null;

      int ichStart = m_ich;
      GetChar ();
      for (;;)
      {
          char chNext = Peek ();
          bool fNameChar = chNext == '_' || chNext == '-' || chNext == '.' || Char.IsLetterOrDigit (chNext);
          if (!fNameChar)
              break;
          GetChar ();
      }

      string strName = m_rgchInput.Substring (ichStart, m_ich - ichStart);
      SkipWhitespace ();
      return strName;
  }
  // Reads a name and classifies it from its context: axis name,
  // (prefixed) QName, prefix:* name test, operator name, node type or
  // function name. m_objToken receives either the raw name string
  // (axis / operator / node type tokens) or an XmlQualifiedName.
  // Throws XPathException when the name is not valid where it occurs.
  private int ParseIdentifier ()
  {
      string strName = ReadIdentifier ();
      Object objReserved = s_mapTokens [strName];
      bool fReserved = objReserved != null;
      int iToken = fReserved ? (int) objReserved : Token.QName;
      m_objToken = strName;

      char chNext = Peek ();
      if (chNext == ':')
      {
          if (Peek (1) == ':')
          {
              // If the two characters following an NCName (possibly
              // after intervening ExprWhitespace) are ::, then the
              // token must be recognized as an AxisName.
              // The "::" itself is left for the next ParseToken () call.
              if (fReserved && IsAxisName (iToken))
                  return iToken;
              throw new XPathException ("invalid axis name: '"+strName+"'");
          }

          // Single ':' - strName is a prefix.
          GetChar ();
          SkipWhitespace ();
          char chAfterColon = Peek ();
          if (chAfterColon == '*')
          {
              // prefix:* - empty name, prefix passed as the second argument
              GetChar ();
              m_objToken = new XmlQualifiedName ("", strName);
              return Token.QName;
          }

          string strLocal = ReadIdentifier ();
          if (strLocal == null)
              throw new XPathException ("invalid QName: "+strName+":"+(char)chAfterColon);

          m_objToken = new XmlQualifiedName (strLocal, strName);
          return (Peek () == '(') ? Token.FUNCTION_NAME : Token.QName;
      }

      // If there is a preceding token and the preceding
      // token is not one of @, ::, (, [, , or an Operator,
      // then a * must be recognized as a MultiplyOperator
      // and an NCName must be recognized as an OperatorName.
      bool fOperatorExpected = !IsFirstToken && !m_fPrevWasOperator;
      if (fOperatorExpected)
      {
          if (fReserved && IsOperatorName (iToken))
              return iToken;
          throw new XPathException ("invalid operator name: '"+strName+"'");
      }

      if (chNext != '(')
      {
          // plain, unprefixed name
          m_objToken = new XmlQualifiedName (strName, "");
          return Token.QName;
      }

      // If the character following an NCName (possibly
      // after intervening ExprWhitespace) is (, then the
      // token must be recognized as a NodeType or a FunctionName.
      if (!fReserved)
      {
          m_objToken = new XmlQualifiedName (strName, "");
          return Token.FUNCTION_NAME;
      }
      if (IsNodeType (iToken))
          return iToken;
      throw new XPathException ("invalid function name: '"+strName+"'");
  }
  // True for the four whitespace characters recognized here: space,
  // tab, line feed and carriage return. (Char.IsWhiteSpace is
  // intentionally not used.)
  private static bool IsWhitespace (char ch)
  {
      switch (ch)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
          return true;
      default:
          return false;
      }
  }
  // True only for the ASCII digits '0'..'9'. (Char.IsDigit is
  // intentionally not used.)
  private static bool IsDigit (char ch)
  {
      if (ch < '0')
          return false;
      return ch <= '9';
  }
  // Scans one token starting at the current character and returns its
  // token id. Sets m_fThisIsOperator for tokens flagged as operators,
  // and m_objToken for tokens that carry a value (via ParseNumber,
  // ParseLiteral and ParseIdentifier).
  // Returns Token.EOF at the end of input and Token.ERROR for a lone ':'.
  // Throws XPathException for any other character that cannot start a
  // token (including '!' not followed by '=').
  int ParseToken ()
  {
      char chFirst = Peek ();
      switch (chFirst)
      {
      case EOL:
          return Token.EOF;

      // quoted literals
      case '\'':
      case '\"':
          return ParseLiteral ();

      // operators that may double up or take a trailing '='
      case '/':
          return ReadOperatorPair ('/', Token.SLASH2, Token.SLASH);
      case '>':
          return ReadOperatorPair ('=', Token.GE, Token.GT);
      case '<':
          return ReadOperatorPair ('=', Token.LE, Token.LT);

      // single-character tokens flagged as operators
      case ',':
          return ReadSingleOperator (Token.COMMA);
      case '@':
          return ReadSingleOperator (Token.AT);
      case '[':
          return ReadSingleOperator (Token.BRACKET_OPEN);
      case '(':
          return ReadSingleOperator (Token.PAREN_OPEN);
      case '+':
          return ReadSingleOperator (Token.PLUS);
      case '-':
          return ReadSingleOperator (Token.MINUS);
      case '$':
          return ReadSingleOperator (Token.DOLLAR);
      case '|':
          return ReadSingleOperator (Token.BAR);
      case '=':
          return ReadSingleOperator (Token.EQ);

      // closers are not flagged as operators
      case ']':
          GetChar ();
          return Token.BRACKET_CLOSE;
      case ')':
          GetChar ();
          return Token.PAREN_CLOSE;

      case '.':
          {
              GetChar ();
              char chAfterDot = Peek ();
              if (chAfterDot == '.')
              {
                  GetChar ();
                  return Token.DOT2;
              }
              if (IsDigit (chAfterDot))
              {
                  // ".5" - hand the '.' back so ParseNumber sees it
                  PutBack ();
                  return ParseNumber ();
              }
              return Token.DOT;
          }

      case ':':
          GetChar ();
          if (Peek () != ':')
              return Token.ERROR;
          m_fThisIsOperator = true;
          GetChar ();
          return Token.COLON2;

      case '*':
          GetChar ();
          // Name test at the start or right after an operator,
          // multiplication everywhere else.
          if (IsFirstToken || m_fPrevWasOperator)
              return Token.ASTERISK;
          m_fThisIsOperator = true;
          return Token.MULTIPLY;

      case '!':
          GetChar ();
          if (Peek () == '=')
          {
              m_fThisIsOperator = true;
              GetChar ();
              return Token.NE;
          }
          break;

      default:
          if (IsDigit (chFirst))
              return ParseNumber ();
          if (Char.IsLetter (chFirst) || chFirst == '_') // NCName
          {
              int iNameToken = ParseIdentifier ();
              if (IsOperatorName (iNameToken))
                  m_fThisIsOperator = true;
              return iNameToken;
          }
          break;
      }
      throw new XPathException ("invalid token: '"+chFirst+"'");
  }

  // Consumes the current character, flags the token as an operator and
  // returns iToken.
  private int ReadSingleOperator (int iToken)
  {
      GetChar ();
      m_fThisIsOperator = true;
      return iToken;
  }

  // Consumes the current character and flags the token as an operator.
  // If the following character is chSecond it is consumed too and
  // iTokenPair is returned; otherwise iTokenSingle is returned.
  private int ReadOperatorPair (char chSecond, int iTokenPair, int iTokenSingle)
  {
      GetChar ();
      m_fThisIsOperator = true;
      if (Peek () != chSecond)
          return iTokenSingle;
      GetChar ();
      return iTokenPair;
  }
  378. ///////////////////////////
  379. // yyParser.yyInput methods
  380. ///////////////////////////
  /** move on to next token.
      Clears the previous token value, scans one token, skips the
      whitespace after it and records the token and its operator flag
      as the "previous" state for the next call.
      @return false once the end of the input has been reached.
      @throws XPathException if the input cannot be tokenized.
    */
  public bool advance ()
  {
      m_objToken = null;
      m_fThisIsOperator = false;

      int iScanned = ParseToken ();
      m_iToken = iScanned;
      SkipWhitespace ();

      m_iTokenPrev = iScanned;
      m_fPrevWasOperator = m_fThisIsOperator;
      return iScanned != Token.EOF;
  }
  /** classifies current token.
      Should not be called if advance() returned false.
      @return the token id stored by the last advance().
    */
  public int token ()
  {
      int iCurrent = m_iToken;
      return iCurrent;
  }
  /** associated with current token.
      Should not be called if advance() returned false.
      @return the value stored for token(), or null when the token
      carries none.
    */
  public Object value ()
  {
      Object objCurrent = m_objToken;
      return objCurrent;
  }
  // True while the recorded previous token is still Token.EOF, i.e. no
  // token precedes the one being scanned.
  private bool IsFirstToken
  {
      get
      {
          return Token.EOF == m_iTokenPrev;
      }
  }
  // True when iToken is one of the node type tokens: comment, text,
  // processing-instruction or node.
  private bool IsNodeType (int iToken)
  {
      return iToken == Token.COMMENT
          || iToken == Token.TEXT
          || iToken == Token.PROCESSING_INSTRUCTION
          || iToken == Token.NODE;
  }
  // True when iToken is one of the named operator tokens: and, or,
  // mod or div.
  private bool IsOperatorName (int iToken)
  {
      return iToken == Token.AND
          || iToken == Token.OR
          || iToken == Token.MOD
          || iToken == Token.DIV;
  }
  // True when iToken is one of the thirteen axis name tokens.
  private bool IsAxisName (int iToken)
  {
      return iToken == Token.ATTRIBUTE
          || iToken == Token.ANCESTOR
          || iToken == Token.ANCESTOR_OR_SELF
          || iToken == Token.CHILD
          || iToken == Token.DESCENDANT
          || iToken == Token.DESCENDANT_OR_SELF
          || iToken == Token.FOLLOWING
          || iToken == Token.FOLLOWING_SIBLING
          || iToken == Token.NAMESPACE
          || iToken == Token.PARENT
          || iToken == Token.PRECEDING
          || iToken == Token.PRECEDING_SIBLING
          || iToken == Token.SELF;
  }
  460. }
  461. }