| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
// Hand-written lexer. It splits a text buffer into tokens: identifiers,
// integer numbers, quoted strings, '//' line comments and single-character
// operators.
//
// Usage: call input(buf) once, then call token() repeatedly until it
// returns null.
class Lexer {
    pos = 0;        // offset in buf of the next unread character
    line = 0;       // zero-based line number of the character at pos
    buf = null;     // text being tokenized (set by input())
    buflen = 0;     // cached buf.len()

    // Operator table, mapping operator -> token name
    static optable = {
        "+": "PLUS",
        "-": "MINUS",
        "*": "MULTIPLY",
        ".": "PERIOD",
        "\\": "BACKSLASH",
        ":": "COLON",
        "%": "PERCENT",
        "|": "PIPE",
        "!": "EXCLAMATION",
        "?": "QUESTION",
        "#": "POUND",
        "&": "AMPERSAND",
        ";": "SEMI",
        ",": "COMMA",
        "(": "L_PAREN",
        ")": "R_PAREN",
        "<": "L_ANG",
        ">": "R_ANG",
        "{": "L_BRACE",
        "}": "R_BRACE",
        "[": "L_BRACKET",
        "]": "R_BRACKET",
        "=": "EQUALS"
    };

    // Initialize the Lexer's buffer. This resets the lexer's internal
    // state and subsequent tokens will be returned starting with the
    // beginning of the new buffer.
    function input(buf)
    {
        this.pos = 0;
        this.line = 0;
        this.buf = buf;
        this.buflen = buf.len();
    }

    // Get the next token from the current buffer. A token is a table with
    // the following slots:
    // - name: name of the pattern that this token matched ("IDENTIFIER",
    //   "NUMBER", "QUOTE", "COMMENT", "DIVIDE" or a name from optable).
    // - value: actual string value of the token.
    // - pos: offset in the current buffer where the token starts.
    // - line: zero-based line on which the token starts.
    //
    // If there are no more tokens in the buffer, returns null. Throws a
    // string describing the problem when a quote is unterminated or when
    // a character cannot start any token.
    function token()
    {
        this._skipnontokens();
        if (this.pos >= this.buflen)
        {
            return null;
        }

        // The char at this.pos is part of a real token. Figure out which.
        var c = this.buf[this.pos];

        // '/' is treated specially, because it starts a comment if followed
        // by another '/'. If not followed by another '/', it's the DIVIDE
        // operator.
        if (c === '/')
        {
            // Only look ahead when there is a next character, so a '/'
            // that ends the buffer is still reported as DIVIDE.
            if (this.pos + 1 < this.buflen && this.buf[this.pos + 1] === '/')
            {
                return this._process_comment();
            }
            return {name: "DIVIDE", value: "/", pos: this.pos++, line: this.line};
        }

        // Look it up in the table of operators. The table is keyed by
        // one-character strings, hence the tochar() conversion.
        var text = c.tochar();
        var op = table_rawget(this.optable, text, false);
        if (op)
        {
            // Store the string form so that 'value' is a string for every
            // kind of token, as documented above.
            return {name: op, value: text, pos: this.pos++, line: this.line};
        }

        // Not an operator - so it's the beginning of another token.
        if (this._isalpha(c))
        {
            return this._process_identifier();
        }
        if (this._isdigit(c))
        {
            return this._process_number();
        }
        if (c === '"' || c === '\'')
        {
            return this._process_quote(c);
        }
        throw ("Token error at " + this.pos);
    }

    // True when c is a carriage return or a line feed.
    function _isnewline(c)
    {
        return c === '\r' || c === '\n';
    }

    // True when c is a decimal digit.
    function _isdigit(c)
    {
        return c >= '0' && c <= '9';
    }

    // True when c may start an identifier: an ASCII letter, '_' or '$'.
    function _isalpha(c)
    {
        return (c >= 'a' && c <= 'z') ||
               (c >= 'A' && c <= 'Z') ||
               c === '_' || c === '$';
    }

    // True when c may continue an identifier.
    function _isalphanum(c)
    {
        return this._isalpha(c) || this._isdigit(c);
    }

    // Consume a run of digits starting at this.pos and return a NUMBER token.
    function _process_number()
    {
        var endpos = this.pos + 1;
        while (endpos < this.buflen && this._isdigit(this.buf[endpos]))
        {
            endpos++;
        }
        var tok = {
            name: "NUMBER",
            value: this.buf.slice(this.pos, endpos),
            pos: this.pos, line: this.line
        };
        this.pos = endpos;
        return tok;
    }

    // Consume a '//' comment starting at this.pos and return a COMMENT
    // token whose value runs up to, but does not include, the line break.
    function _process_comment()
    {
        // Skip the two slashes, then everything until the end of the line.
        var endpos = this.pos + 2;
        while (endpos < this.buflen && !this._isnewline(this.buf[endpos]))
        {
            endpos++;
        }
        var tok = {
            name: "COMMENT",
            value: this.buf.slice(this.pos, endpos),
            pos: this.pos, line: this.line
        };
        // Stop on the line break itself: _skipnontokens() consumes it and
        // is the single place where line breaks are counted.
        this.pos = endpos;
        return tok;
    }

    // Consume an identifier starting at this.pos and return an IDENTIFIER
    // token.
    function _process_identifier()
    {
        var endpos = this.pos + 1;
        while (endpos < this.buflen && this._isalphanum(this.buf[endpos]))
        {
            endpos++;
        }
        var tok = {
            name: "IDENTIFIER",
            value: this.buf.slice(this.pos, endpos),
            pos: this.pos, line: this.line
        };
        this.pos = endpos;
        return tok;
    }

    // Consume a quoted string and return a QUOTE token whose value
    // includes both quote characters. 'quote' is the opening quote
    // character, in the same form as the elements of buf.
    // Throws when no closing quote is found before the end of the buffer.
    function _process_quote(quote)
    {
        // this.pos points at the opening quote. Scan forward for the
        // matching closing quote.
        var endpos = this.pos + 1;
        var endline = this.line;
        while (endpos < this.buflen && this.buf[endpos] !== quote)
        {
            // A backslash escapes the character after it, so step over the
            // backslash and let the code below consume the escaped
            // character. This handles any number of consecutive
            // backslashes in front of a quote.
            if (this.buf[endpos] === '\\' && endpos + 1 < this.buflen)
            {
                ++endpos;
            }
            // Keep the line counter accurate for strings spanning several
            // lines. A CR that is followed by LF is not counted, because
            // the LF is.
            var c = this.buf[endpos];
            if (c === '\n' ||
                (c === '\r' && (endpos + 1 >= this.buflen ||
                                this.buf[endpos + 1] !== '\n')))
            {
                ++endline;
            }
            ++endpos;
        }
        if (endpos >= this.buflen)
        {
            throw ("Unterminated quote at " + this.pos);
        }
        var tok = {
            name: "QUOTE",
            value: this.buf.slice(this.pos, endpos + 1),
            pos: this.pos, line: this.line
        };
        this.pos = endpos + 1;
        this.line = endline;
        return tok;
    }

    // Advance this.pos past spaces, tabs and line breaks, counting each
    // line break ("\n", "\r" or "\r\n") exactly once in this.line.
    function _skipnontokens()
    {
        while (this.pos < this.buflen)
        {
            var c = this.buf[this.pos];
            if (c === ' ' || c === '\t')
            {
                ++this.pos;
            }
            else if (this._isnewline(c))
            {
                ++this.pos;
                // A CR LF pair is one line break: swallow the LF as well.
                if (c === '\r' && this.pos < this.buflen &&
                    this.buf[this.pos] === '\n')
                {
                    ++this.pos;
                }
                ++this.line;
            }
            else
            {
                break;
            }
        }
    }
}
// Demo driver: tokenize the contents of "lexer.nut" and print one line per
// token (name, value, one-based line number, start offset).

// Sample input; it is replaced by the file contents just below.
var txt = "var lex = new Lexer(23, \"dad\");";

// Read the whole file into txt.
var fd = file("lexer.nut", "r");
txt = fd.read(fd.len());
fd.close();

var lex = new Lexer();
lex.input(txt);

// token() returns null once the buffer is exhausted, which ends the loop.
for (var tok = lex.token(); tok; tok = lex.token())
{
    print(tok.name, tok.value, tok.line+1, tok.pos);
}
|