cpp
/
squilu
mirror of https://github.com/mingodad/squilu.git


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
							class Lexer {
    // Offset in buf of the next character that has not been consumed yet.
    pos = 0;
    // Zero-based count of line breaks consumed so far.
    line = 0;
    // Text being tokenized; null until input() is called.
    buf = null;
    // Cached buf.len(), refreshed by input().
    buflen = 0;

    // Operator table, mapping operator -> token name.
    // '/' is deliberately absent: token() handles it separately because it
    // may also start a "//" comment.
    static optable = {
        "+":  "PLUS",
        "-":  "MINUS",
        "*":  "MULTIPLY",
        ".":  "PERIOD",
        "\\": "BACKSLASH",
        ":":  "COLON",
        "%":  "PERCENT",
        "|":  "PIPE",
        "!":  "EXCLAMATION",
        "?":  "QUESTION",
        "#":  "POUND",
        "&":  "AMPERSAND",
        ";":  "SEMI",
        ",":  "COMMA",
        "(":  "L_PAREN",
        ")":  "R_PAREN",
        "<":  "L_ANG",
        ">":  "R_ANG",
        "{":  "L_BRACE",
        "}":  "R_BRACE",
        "[":  "L_BRACKET",
        "]":  "R_BRACKET",
        "=":  "EQUALS"
    };


    // Initialize the Lexer's buffer. This resets the lexer's internal
    // state and subsequent tokens will be returned starting with the
    // beginning of the new buffer.
    // - buf: the text to tokenize; it must provide len(), indexing and slice().
    function input(buf)
    {
        this.pos = 0;
        this.line = 0;
        this.buf = buf;
        this.buflen = buf.len();
    }

    // Get the next token from the current buffer. A token is a table with
    // the following slots:
    // - name: kind of token: "NUMBER", "IDENTIFIER", "QUOTE", "COMMENT",
    //   "DIVIDE" or one of the names listed in optable.
    // - value: text of the token (a slice of the buffer).
    //   NOTE(review): operator and DIVIDE tokens store the raw character
    //   taken from the buffer rather than a slice, exactly as before --
    //   confirm whether callers expect a string there.
    // - pos: offset in the current buffer where the token starts.
    // - line: zero-based line on which the token starts.
    //
    // Spaces, tabs and line breaks between tokens are skipped. If there are
    // no more tokens in the buffer, returns null. On an unterminated quote
    // or an unrecognized character a message string is thrown.
    function token()
    {
        var _isnewline = function(c)
        {
            return c == '\r' || c == '\n';
        }

        var _isdigit = function(c)
        {
            return c >= '0' && c <= '9';
        }

        var _isalpha = function(c)
        {
            return (c >= 'a' && c <= 'z') ||
                   (c >= 'A' && c <= 'Z') ||
                   c == '_' || c == '$';
        }

        var _isalphanum = function(c)
        {
            return _isalpha(c) || _isdigit(c);
        }

        // True when the character at offset i finishes a line. A '\r' that
        // is immediately followed by '\n' does not count, so a CRLF pair
        // is counted once (on its '\n').
        var _ends_line = function(i)
        {
            var ch = this.buf[i];
            if (ch == '\n')
            {
                return true;
            }
            return ch == '\r' &&
                   (i + 1 >= this.buflen || this.buf[i + 1] != '\n');
        }

        // Consume a run of decimal digits starting at this.pos.
        var _process_number = function()
        {
            var endpos = this.pos + 1;
            while (endpos < this.buflen &&
                    _isdigit(this.buf[endpos]))
            {
                endpos++;
            }

            var tok = {
                name: "NUMBER",
                value: this.buf.slice(this.pos, endpos),
                pos: this.pos, line: this.line
            };
            this.pos = endpos;
            return tok;
        }

        // Consume a "//" comment up to, but not including, the line break.
        // The line break itself is left for _skipnontokens so that every
        // line break is counted in exactly one place.
        var _process_comment = function()
        {
            var endpos = this.pos + 2;
            while (endpos < this.buflen &&
                    !_isnewline(this.buf[endpos]))
            {
                endpos++;
            }

            var tok = {
                name: "COMMENT",
                value: this.buf.slice(this.pos, endpos),
                pos: this.pos, line: this.line
            };
            this.pos = endpos;
            return tok;
        }

        // Consume an identifier: letters, digits, '_' and '$'.
        var _process_identifier = function()
        {
            var endpos = this.pos + 1;
            while (endpos < this.buflen &&
                    _isalphanum(this.buf[endpos]))
            {
                endpos++;
            }

            var tok = {
                name: "IDENTIFIER",
                value: this.buf.slice(this.pos, endpos),
                pos: this.pos, line: this.line
            };
            this.pos = endpos;
            return tok;
        }

        // Consume a quoted string. this.pos points at the opening quote and
        // `quote` is that same character as read from the buffer. The scan
        // walks forward and steps over whatever follows a backslash, so any
        // number of consecutive backslashes before a quote is handled.
        var _process_quote = function(quote)
        {
            var i = this.pos + 1;
            var newlines = 0;
            while (i < this.buflen)
            {
                var ch = this.buf[i];
                if (ch == quote)
                {
                    var tok = {
                        name: "QUOTE",
                        value: this.buf.slice(this.pos, i + 1),
                        pos: this.pos, line: this.line
                    };
                    this.pos = i + 1;
                    // Line breaks inside the string advance the counter
                    // only after the token recorded its starting line.
                    this.line += newlines;
                    return tok;
                }
                if (ch == '\\')
                {
                    // Step onto the escaped character; it can never close
                    // the string.
                    ++i;
                    if (i >= this.buflen)
                    {
                        break;
                    }
                }
                if (_ends_line(i))
                {
                    ++newlines;
                }
                ++i;
            }
            throw ("Unterminated quote at " + this.pos);
        }

        // Skip spaces, tabs and line breaks, keeping this.line up to date.
        var _skipnontokens = function()
        {
            while (this.pos < this.buflen)
            {
                var c = this.buf[this.pos];
                if (c == ' ' || c == '\t')
                {
                    ++this.pos;
                }
                else if (_isnewline(c))
                {
                    if (_ends_line(this.pos))
                    {
                        ++this.line;
                    }
                    ++this.pos;
                }
                else
                {
                    break;
                }
            }
        }

        _skipnontokens();
        if (this.pos >= this.buflen)
        {
            return null;
        }

        // The char at this.pos is part of a real token. Figure out which.
        var c = this.buf[this.pos];

        // '/' is treated specially, because it starts a comment if followed by
        // another '/'. If not followed by another '/', it's the DIVIDE
        // operator.
        if (c == '/')
        {
            // Only peek at the next character when there is one; a '/' at
            // the very end of the buffer is a plain DIVIDE.
            if (this.pos + 1 < this.buflen && this.buf[this.pos + 1] == '/')
            {
                return _process_comment();
            }
            var div_tok = {name: "DIVIDE", value: '/', pos: this.pos, line: this.line};
            ++this.pos;
            return div_tok;
        }

        // Look it up in the table of operators
        var op = table_rawget(this.optable, c.tochar(), false);
        if (op)
        {
            var op_tok = {name: op, value: c, pos: this.pos, line: this.line};
            ++this.pos;
            return op_tok;
        }

        // Not an operator - so it's the beginning of another token.
        if (_isalpha(c))
        {
            return _process_identifier();
        }
        if (_isdigit(c))
        {
            return _process_number();
        }
        if (c == '"' || c == '\'')
        {
            return _process_quote(c);
        }
        throw ("Token error at " + this.pos);
    }
}

// Demo driver: tokenize the file "lexer.nut" and print one line per token.

// Inline sample input; it is replaced by the file contents read below.
var txt = "var lex = new Lexer(23, \"dad\");";
var lex = new Lexer();

// Read the whole file into txt and release the handle right away.
var source_file = file("lexer.nut", "r");
txt = source_file.read(source_file.len());
source_file.close();

lex.input(txt);

// token() returns null once the buffer is exhausted, which ends the loop.
// Lines are stored zero-based, so add one when printing.
var tok;
for (tok = lex.token(); tok; tok = lex.token()) {
	print(tok.name, tok.value, tok.line+1, tok.pos);
}