// lexer.nut
  // Hand-written lexer that splits a source string into tokens, one per
  // call. Load text with input(), then call token() repeatedly until it
  // returns null.
  class Lexer {
      // Offset in buf of the next character to examine.
      pos = 0;
      // Zero-based line number of the character at pos.
      line = 0;
      // String being tokenized (null until input() is called).
      buf = null;
      // Cached length of buf.
      buflen = 0;

      // Operator table, mapping operator -> token name. '/' is not listed
      // here: token() handles it separately because it can start a comment.
      static optable = {
          "+": "PLUS",
          "-": "MINUS",
          "*": "MULTIPLY",
          ".": "PERIOD",
          "\\": "BACKSLASH",
          ":": "COLON",
          "%": "PERCENT",
          "|": "PIPE",
          "!": "EXCLAMATION",
          "?": "QUESTION",
          "#": "POUND",
          "&": "AMPERSAND",
          ";": "SEMI",
          ",": "COMMA",
          "(": "L_PAREN",
          ")": "R_PAREN",
          "<": "L_ANG",
          ">": "R_ANG",
          "{": "L_BRACE",
          "}": "R_BRACE",
          "[": "L_BRACKET",
          "]": "R_BRACKET",
          "=": "EQUALS"
      };

      // Initialize the Lexer's buffer. This resets the lexer's internal
      // state, so subsequent tokens are returned starting from the
      // beginning of the new buffer.
      // - buf: string to tokenize.
      function input(buf)
      {
          this.pos = 0;
          this.line = 0;
          this.buf = buf;
          this.buflen = buf.len();
      }

      // Get the next token from the current buffer. A token is a table with
      // the following slots:
      // - name: kind of token: an optable value, or one of "DIVIDE",
      //   "COMMENT", "IDENTIFIER", "NUMBER", "QUOTE".
      // - value: string value of the token, exactly as it appears in buf.
      // - pos: offset in the current buffer where the token starts.
      // - line: zero-based line on which the token starts.
      //
      // Spaces, tabs and line breaks between tokens are skipped. If there
      // are no more tokens in the buffer, returns null. On an unrecognized
      // character or an unterminated quote, throws a string describing the
      // offending offset.
      function token()
      {
          var _isnewline = function(c)
          {
              return c === '\r' || c === '\n';
          }

          var _isdigit = function(c)
          {
              return c >= '0' && c <= '9';
          }

          // Characters that may start an identifier.
          var _isalpha = function(c)
          {
              return (c >= 'a' && c <= 'z') ||
                     (c >= 'A' && c <= 'Z') ||
                     c === '_' || c === '$';
          }

          // Characters that may continue an identifier.
          var _isalphanum = function(c)
          {
              return _isalpha(c) || _isdigit(c);
          }

          // Count the line breaks in buf[startpos, endpos). A "\r\n" pair
          // counts as a single line break.
          var _countlines = function(startpos, endpos)
          {
              var count = 0;
              var i = startpos;
              while (i < endpos)
              {
                  var ch = this.buf[i];
                  if (ch == '\r')
                  {
                      // Step over the '\n' of a CRLF pair so it is not
                      // counted a second time.
                      if (i + 1 < endpos && this.buf[i + 1] == '\n')
                      {
                          ++i;
                      }
                      ++count;
                  }
                  else if (ch == '\n')
                  {
                      ++count;
                  }
                  ++i;
              }
              return count;
          }

          // Consume digits with at most one '.', starting at this.pos.
          var _process_number = function()
          {
              var endpos = this.pos + 1;
              var hasDot = false;
              while (endpos < this.buflen)
              {
                  var ch = this.buf[endpos];
                  if (!_isdigit(ch))
                  {
                      // Only the first '.' belongs to the number.
                      if (!hasDot && (ch == '.')) hasDot = true;
                      else break;
                  }
                  endpos++;
              }
              var tok = {
                  name: "NUMBER",
                  value: this.buf.slice(this.pos, endpos),
                  pos: this.pos, line: this.line
              };
              this.pos = endpos;
              return tok;
          }

          // Consume a "//" comment up to, but not including, the line break.
          var _process_comment = function()
          {
              // this.pos points at the first '/', so the text starts 2 later.
              var endpos = this.pos + 2;
              while (endpos < this.buflen &&
                     !_isnewline(this.buf[endpos]))
              {
                  endpos++;
              }
              var tok = {
                  name: "COMMENT",
                  value: this.buf.slice(this.pos, endpos),
                  pos: this.pos, line: this.line
              };
              // The line break is left for _skipnontokens so that line
              // counting happens in a single place.
              this.pos = endpos;
              return tok;
          }

          // Consume an identifier starting at this.pos.
          var _process_identifier = function()
          {
              var endpos = this.pos + 1;
              while (endpos < this.buflen &&
                     _isalphanum(this.buf[endpos]))
              {
                  endpos++;
              }
              var tok = {
                  name: "IDENTIFIER",
                  value: this.buf.slice(this.pos, endpos),
                  pos: this.pos, line: this.line
              };
              this.pos = endpos;
              return tok;
          }

          // Consume a quoted string. quote is the character code of the
          // opening quote; the token value includes both quote characters.
          var _process_quote = function(quote)
          {
              // this.pos points at the opening quote. Scan forward for the
              // matching closing quote.
              var endpos = this.pos + 1;
              while (endpos < this.buflen)
              {
                  var ch = this.buf[endpos];
                  if (ch == quote)
                  {
                      break;
                  }
                  if (ch == '\\')
                  {
                      // A backslash escapes the next character, so step
                      // over both. This handles any run of backslashes.
                      endpos += 2;
                  }
                  else
                  {
                      endpos++;
                  }
              }
              if (endpos >= this.buflen)
              {
                  throw ("Unterminated quote at " + this.pos);
              }
              var tok = {
                  name: "QUOTE",
                  value: this.buf.slice(this.pos, endpos + 1),
                  pos: this.pos, line: this.line
              };
              // Line breaks inside the literal still advance the line
              // counter for the tokens that follow.
              this.line += _countlines(this.pos, endpos);
              this.pos = endpos + 1;
              return tok;
          }

          // Advance this.pos past spaces, tabs and line breaks, keeping
          // this.line up to date.
          var _skipnontokens = function()
          {
              while (this.pos < this.buflen)
              {
                  var c = this.buf[this.pos];
                  if (c == ' ' || c == '\t')
                  {
                      ++this.pos;
                  }
                  else if (c == '\n')
                  {
                      ++this.pos;
                      ++this.line;
                  }
                  else if (c == '\r')
                  {
                      ++this.pos;
                      // Treat "\r\n" as one line break, not two.
                      if (this.pos < this.buflen && this.buf[this.pos] == '\n')
                      {
                          ++this.pos;
                      }
                      ++this.line;
                  }
                  else
                  {
                      break;
                  }
              }
          }

          _skipnontokens();
          if (this.pos >= this.buflen)
          {
              return null;
          }

          // The char at this.pos is part of a real token. Figure out which.
          var c = this.buf[this.pos];

          // '/' is treated specially, because it starts a comment if followed
          // by another '/'. Otherwise it is the DIVIDE operator.
          if (c === '/')
          {
              // Bounds check first: '/' may be the last character.
              if (this.pos + 1 < this.buflen && this.buf[this.pos + 1] === '/')
              {
                  return _process_comment();
              }
              return {name: "DIVIDE", value: "/", pos: this.pos++, line: this.line};
          }

          // Look it up in the table of operators.
          // NOTE(review): relies on table_rawget returning its third
          // argument (false) when the key is absent - confirm.
          var op = table_rawget(this.optable, c.tochar(), false);
          if (op)
          {
              // c is a character code; convert it so value is a string like
              // every other token's value.
              return {name: op, value: c.tochar(), pos: this.pos++, line: this.line};
          }

          // Not an operator - so it's the beginning of another token.
          if (_isalpha(c))
          {
              return _process_identifier();
          }
          if (_isdigit(c))
          {
              return _process_number();
          }
          if (c === '"' || c === '\'')
          {
              return _process_quote(c);
          }
          throw ("Token error at " + this.pos);
      }
  }
  // Demo driver: tokenize a sample statement and print every token.
  var source = "var lex = new Lexer(26.389, \"dad\");";
  var lexer = new Lexer();

  /*
  // Alternative input: tokenize the contents of lexer.nut instead.
  var fd = file("lexer.nut", "r");
  source = fd.read(fd.len());
  fd.close();
  */

  lexer.input(source);

  // token() returns null once the buffer is exhausted, which ends the loop.
  var current = lexer.token();
  while (current) {
      // Lines are stored zero-based; print them one-based.
      print(current.name, current.value, current.line + 1, current.pos);
      current = lexer.token();
  }