lexer.nut 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  // A small hand-written lexer. Feed it a string with input() and then call
  // token() repeatedly; each call returns the next token or null when the
  // buffer is exhausted.
  class Lexer {
      pos = 0;        // offset of the next unread character in buf
      line = 0;       // zero-based line number of the character at pos
      buf = null;     // the string being tokenized
      buflen = 0;     // cached buf.len()

      // Operator table, mapping operator -> token name.
      // '/' is deliberately absent: token() handles it separately because
      // it may start a comment.
      static optable = {
          "+": "PLUS",
          "-": "MINUS",
          "*": "MULTIPLY",
          ".": "PERIOD",
          "\\": "BACKSLASH",
          ":": "COLON",
          "%": "PERCENT",
          "|": "PIPE",
          "!": "EXCLAMATION",
          "?": "QUESTION",
          "#": "POUND",
          "&": "AMPERSAND",
          ";": "SEMI",
          ",": "COMMA",
          "(": "L_PAREN",
          ")": "R_PAREN",
          "<": "L_ANG",
          ">": "R_ANG",
          "{": "L_BRACE",
          "}": "R_BRACE",
          "[": "L_BRACKET",
          "]": "R_BRACKET",
          "=": "EQUALS"
      };

      // Initialize the Lexer's buffer. This resets the lexer's internal
      // state and subsequent tokens will be returned starting with the
      // beginning of the new buffer.
      function input(buf)
      {
          this.pos = 0;
          this.line = 0;
          this.buf = buf;
          this.buflen = buf.len();
      }

      // Get the next token from the current buffer. A token is a table with
      // the following slots:
      // - name: kind of token (NUMBER, IDENTIFIER, QUOTE, COMMENT, DIVIDE or
      //   one of the names in optable).
      // - value: string value of the token, exactly as it appears in the
      //   buffer (quotes and the leading "//" of comments are included).
      // - pos: offset in the current buffer where the token starts.
      // - line: zero-based line on which the token starts.
      //
      // If there are no more tokens in the buffer, returns null. Throws a
      // string message on an unrecognized character or an unterminated quote.
      function token()
      {
          this._skipnontokens();
          if (this.pos >= this.buflen)
          {
              return null;
          }

          // The char at this.pos is part of a real token. Figure out which.
          var c = this.buf[this.pos];

          // '/' is treated specially, because it starts a comment if followed
          // by another '/'. Otherwise it's the DIVIDE operator.
          if (c === '/')
          {
              // Bounds check first: '/' may be the very last character.
              if (this.pos + 1 < this.buflen && this.buf[this.pos + 1] === '/')
              {
                  return this._process_comment();
              }
              return {name: "DIVIDE", value: "/", pos: this.pos++, line: this.line};
          }

          // Look it up in the table of operators. Indexing a string yields a
          // character code, so convert it to a one-character string for both
          // the lookup and the token value.
          var ch = c.tochar();
          var op = table_rawget(this.optable, ch, false);
          if (op)
          {
              return {name: op, value: ch, pos: this.pos++, line: this.line};
          }

          // Not an operator - so it's the beginning of another token.
          if (this._isalpha(c))
          {
              return this._process_identifier();
          }
          if (this._isdigit(c))
          {
              return this._process_number();
          }
          if (c === '"' || c === '\'')
          {
              return this._process_quote(ch);
          }
          throw ("Token error at " + this.pos);
      }

      // True if c is the character code of a decimal digit.
      function _isdigit(c)
      {
          return c >= '0' && c <= '9';
      }

      // True if c may start an identifier: a letter, '_' or '$'.
      function _isalpha(c)
      {
          return (c >= 'a' && c <= 'z') ||
                 (c >= 'A' && c <= 'Z') ||
                 c === '_' || c === '$';
      }

      // True if c may continue an identifier: anything _isalpha accepts,
      // or a digit.
      function _isalphanum(c)
      {
          return this._isalpha(c) || this._isdigit(c);
      }

      // Build a token named `name` covering buf[pos, endpos) and advance
      // pos to endpos.
      function _maketoken(name, endpos)
      {
          var tok = {
              name: name,
              value: this.buf.slice(this.pos, endpos),
              pos: this.pos, line: this.line
          };
          this.pos = endpos;
          return tok;
      }

      // Consume a run of digits starting at pos and return a NUMBER token.
      function _process_number()
      {
          var endpos = this.pos + 1;
          while (endpos < this.buflen && this._isdigit(this.buf[endpos]))
          {
              endpos++;
          }
          return this._maketoken("NUMBER", endpos);
      }

      // Consume a "//" comment up to (not including) the end of the line and
      // return a COMMENT token. The line break itself is left in the buffer
      // so that _skipnontokens is the only place that counts lines.
      function _process_comment()
      {
          var endpos = this.pos + 2;
          while (endpos < this.buflen &&
                 this.buf[endpos] !== '\r' && this.buf[endpos] !== '\n')
          {
              endpos++;
          }
          return this._maketoken("COMMENT", endpos);
      }

      // Consume an identifier starting at pos and return an IDENTIFIER token.
      function _process_identifier()
      {
          var endpos = this.pos + 1;
          while (endpos < this.buflen && this._isalphanum(this.buf[endpos]))
          {
              endpos++;
          }
          return this._maketoken("IDENTIFIER", endpos);
      }

      // Consume a quoted string. this.pos points at the opening quote and
      // `quote` is that quote as a one-character string. Returns a QUOTE
      // token whose value includes both quotes; throws if the closing quote
      // is missing.
      function _process_quote(quote)
      {
          var quote_c = quote[0];
          var idx = this.pos + 1;
          while (idx < this.buflen)
          {
              var c = this.buf[idx];
              if (c === '\\')
              {
                  // A backslash escapes the next character whatever it is,
                  // so "\\" and "\"" are both stepped over as a pair. This
                  // keeps any run of backslashes before a quote correct.
                  idx += 2;
              }
              else if (c === quote_c)
              {
                  return this._maketoken("QUOTE", idx + 1);
              }
              else
              {
                  ++idx;
              }
          }
          throw ("Unterminated quote at " + this.pos);
      }

      // Advance pos past spaces, tabs and line breaks, keeping `line` up to
      // date. "\r\n" counts as a single line break; a lone '\r' or '\n'
      // counts as one each.
      function _skipnontokens()
      {
          while (this.pos < this.buflen)
          {
              var c = this.buf[this.pos];
              if (c === ' ' || c === '\t')
              {
                  ++this.pos;
              }
              else if (c === '\n')
              {
                  ++this.pos;
                  ++this.line;
              }
              else if (c === '\r')
              {
                  ++this.pos;
                  if (this.pos < this.buflen && this.buf[this.pos] === '\n')
                  {
                      ++this.pos;
                  }
                  ++this.line;
              }
              else
              {
                  break;
              }
          }
      }
  }
  // Demo driver: tokenize the contents of lexer.nut and print every token.
  var txt = "var lex = new Lexer(23, \"dad\");";
  var lex = new Lexer();

  // Replace the sample text above with the contents of the file on disk.
  var fd = file("lexer.nut", "r");
  txt = fd.read(fd.len());
  fd.close();

  lex.input(txt);
  while (true)
  {
      var tok = lex.token();
      if (!tok)
      {
          // token() returns null once the buffer is exhausted.
          break;
      }
      // Token lines are zero-based; print them one-based.
      print(tok.name, tok.value, tok.line+1, tok.pos);
  }