c
/
odin-lang.Odin
mirror of https://github.com/odin-lang/Odin


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655
							// package text/scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes a string providing the source, which then can be tokenized through
// repeated calls to the scan procedure.
// For compatibility with existing tooling and languages, the NUL character is not allowed.
// If a UTF-8 encoded byte order mark (BOM) is the first character in the source, it will be discarded.
//
// By default, a Scanner skips white space and Odin comments and recognizes all literals defined by the Odin programming language specification.
// A Scanner may be customized to recognize only a subset of those literals and to recognize different identifiers and white space characters.
package text_scanner

import "core:fmt"
import "core:strings"
import "core:unicode"
import "core:unicode/utf8"

// Position represents a source position
// A position is valid if line > 0; next and scan set line to 0 to mark
// the scanner's pos field as invalid before a new token position is recorded
Position :: struct {
	filename: string, // filename, if present
	offset:   int,    // byte offset, starting @ 0
	line:     int,    // line number, starting @ 1
	column:   int,    // column number, starting @ 1 (character count per line)
}

// position_is_valid reports whether the position is valid,
// i.e. whether its line number is positive
position_is_valid :: proc(pos: Position) -> bool {
	has_line := 0 < pos.line;
	return has_line;
}

// position_to_string renders pos as "filename(line:column)", or as just the filename
// when the position is not valid. An empty filename is shown as "<input>".
// The returned string is allocated with allocator (context.temp_allocator by default)
position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
	// Both fmt.aprintf and strings.clone below allocate through the context allocator
	context.allocator = allocator;

	name := pos.filename;
	if len(name) == 0 {
		name = "<input>";
	}

	if !position_is_valid(pos) {
		return strings.clone(name);
	}
	return fmt.aprintf("%s(%d:%d)", name, pos.line, pos.column);
}

// Token kinds returned by scan.
// They are all negative; for a character that does not start a recognized token,
// scan returns the character itself instead
EOF        :: -1; // end of the source
Ident      :: -2; // identifier
Int        :: -3; // integer literal
Float      :: -4; // floating-point literal
Char       :: -5; // character literal delimited by '
String     :: -6; // string literal delimited by "
Raw_String :: -7; // raw string literal delimited by `
Comment    :: -8; // line or block comment

// Scan_Flag selects which kinds of tokens scan recognizes.
// When the flag for a token kind is not set, scan returns the characters
// of such a token individually instead
Scan_Flag :: enum u32 {
	Scan_Idents,         // identifiers are returned as Ident
	Scan_Ints,           // integer literals are returned as Int
	Scan_C_Int_Prefixes, // after a leading 0 only the 0b and 0x prefixes are recognized; any other leading 0 is treated as octal
	Scan_Floats, // Includes integers and hexadecimal floats
	Scan_Chars,          // '...' literals are returned as Char
	Scan_Strings,        // "..." literals are returned as String
	Scan_Raw_Strings,    // `...` literals are returned as Raw_String
	Scan_Comments,       // // and /* */ comments are returned as Comment
	Skip_Comments, // if set with .Scan_Comments, comments become white space
}
// Scan_Flags is a set of Scan_Flag values, used for the Scanner's flags field
Scan_Flags :: distinct bit_set[Scan_Flag; u32];

// Odin_Like_Tokens is the flag set installed by init: every token kind is recognized and comments are skipped
Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
// C_Like_Tokens is Odin_Like_Tokens with .Scan_C_Int_Prefixes added
C_Like_Tokens    :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};

// Whitespace is the set of characters that scan skips before a token
// Only allows for ASCII whitespace: members must be below utf8.RUNE_SELF
Whitespace :: distinct bit_set['\x00'..<utf8.RUNE_SELF; u128];

// Odin_Whitespace is the default value for the Scanner's whitespace field
Odin_Whitespace :: Whitespace{'\t', '\n', '\r', ' '};
// C_Whitespace additionally contains vertical tab and form feed
C_Whitespace    :: Whitespace{'\t', '\n', '\r', '\v', '\f', ' '};


// Scanner allows for the reading of Unicode characters and tokens from a string
// Use init to prepare a Scanner before calling next, peek or scan
Scanner :: struct {
	// The text being scanned
	src: string,

	// src_pos is the byte offset in src of the next character to be read by advance
	src_pos: int,
	// NOTE(review): src_end is not referenced by any procedure visible in this file; confirm before relying on it
	src_end: int,

	// tok_pos and tok_end are the byte offsets in src where the text of the most recently scanned token
	// starts and ends (see token_text); tok_pos is -1 when there is no token text
	tok_pos: int,
	tok_end: int,

	// ch is the character that has been read but not yet consumed (one character of look-ahead)
	// The value -2 means no character has been read yet
	ch: rune,

	// line and column track the characters read so far; column is reset to 0 after a newline
	line:   int,
	column: int,
	// prev_line_len is the value column had when the most recent newline was read
	prev_line_len: int,
	// prev_char_len is the width in bytes of the most recently read character, 0 at the end of the source
	prev_char_len: int,

	// error is called for each error encountered
	// If no error procedure is set, the error is printed to standard error using fmt.eprintf
	error: proc(s: ^Scanner, msg: string),

	// error_count is incremented by one for each error encountered
	error_count: int,

	// flags controls which tokens are recognized
	// e.g. to recognize integers, set the .Scan_Ints flag
	// This field may be changed by the user at any time during scanning
	flags: Scan_Flags,

	// The whitespace field controls which characters are recognized as white space
	// This field may be changed by the user at any time during scanning
	whitespace: Whitespace,

	// is_ident_rune is a predicate controlling the characters accepted as the ith rune in an identifier
	// The valid characters must not conflict with the set of white space characters
	// If is_ident_rune is not set, regular Odin-like identifiers are accepted
	// This field may be changed by the user at any time during scanning
	is_ident_rune: proc(ch: rune, i: int) -> bool,

	// Start position of most recently scanned token (set by scan(s))
	// Calling init or next invalidates the position
	pos: Position,
}

// init prepares s to scan src from its beginning and returns s.
// Every field of s is reset: error_count becomes 0, flags becomes Odin_Like_Tokens,
// whitespace becomes Odin_Whitespace, and the error and is_ident_rune procedures are cleared.
// filename is only used when reporting positions
init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
	// Start from the zero value so that counters and callbacks are cleared
	fresh: Scanner;

	fresh.src          = src;
	fresh.pos.filename = filename;
	fresh.line         = 1;
	fresh.tok_pos      = -1; // no token text yet
	fresh.ch           = -2; // no char read yet, not an EOF
	fresh.flags        = Odin_Like_Tokens;
	fresh.whitespace   = Odin_Whitespace;

	s^ = fresh;
	return s;
}


// advance reads the character at src_pos, moves src_pos past it and updates the line/column bookkeeping.
// It returns EOF at the end of the source. An invalid UTF-8 sequence and a NUL character are reported
// through error; the character is still returned (utf8.RUNE_ERROR for invalid UTF-8)
@(private)
advance :: proc(s: ^Scanner) -> rune {
	if s.src_pos >= len(s.src) {
		s.prev_char_len = 0;
		return EOF;
	}

	// Fast path: a single byte below RUNE_SELF is the character itself
	ch := rune(s.src[s.src_pos]);
	width := 1;
	bad_encoding := false;

	if ch >= utf8.RUNE_SELF {
		ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:]);
		bad_encoding = ch == utf8.RUNE_ERROR && width == 1;
	}

	// The character is consumed before any error is reported
	s.src_pos += width;
	s.prev_char_len = width;
	s.column += 1;

	if bad_encoding {
		error(s, "invalid UTF-8 encoding");
		return ch;
	}

	if ch == 0 {
		error(s, "invalid character NUL");
	} else if ch == '\n' {
		s.line += 1;
		s.prev_line_len = s.column;
		s.column = 0;
	}

	return ch;
}

// next consumes and returns the next Unicode character, or EOF once the source is exhausted.
// It invalidates the token text and the Scanner's pos field; it does not set pos.
// Use 'position(s)' to get the current position
next :: proc(s: ^Scanner) -> rune {
	// Invalidate the token state first: peek may read a character and report an error
	s.pos.line = 0;
	s.tok_pos = -1;

	current := peek(s);
	if current == EOF {
		return current;
	}
	s.ch = advance(s);
	return current;
}

// peek returns the next Unicode character in the source without advancing the scanner
// It returns EOF if the scanner's position is at the end of the source
// If n > 0, it calls next n times, takes the character that is then pending, and restores the Scanner's state
peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
	if s.ch == -2 {
		// Nothing has been read yet: load the first character
		s.ch = advance(s);
		if s.ch == '\ufeff' { // Ignore BOM
			s.ch = advance(s);
		}
	}

	if n <= 0 {
		return s.ch;
	}

	// Look ahead on the live scanner, then roll every field back
	saved := s^;
	for i := 0; i < n; i += 1 {
		next(s);
	}
	ch = s.ch;
	s^ = saved;
	return ch;
}
// peek_token returns an upcoming token without advancing the scanner
// With n == 0 it returns the token the next call to scan would return; with n > 0 it first
// scans past n tokens and returns the one after them. The Scanner's state is restored afterwards
// It returns EOF if the end of the source is reached. n must not be negative
peek_token :: proc(s: ^Scanner, n := 0) -> (tok: rune) {
	assert(n >= 0);

	saved := s^;
	for skipped := 0; skipped < n; skipped += 1 {
		tok = scan(s);
	}
	tok = scan(s);
	s^ = saved;

	return tok;
}

// error counts an error and reports msg.
// If the Scanner has an error procedure it is called and nothing is printed; otherwise the message
// is printed with fmt.eprintf, prefixed by the token position, or by the current position
// when no valid token position is recorded
error :: proc(s: ^Scanner, msg: string) {
	s.error_count += 1;

	if handler := s.error; handler != nil {
		handler(s, msg);
		return;
	}

	where := s.pos;
	if !position_is_valid(where) {
		where = position(s);
	}

	name := where.filename;
	if len(name) == 0 {
		name = "<input>";
	}

	if !position_is_valid(where) {
		fmt.eprintf("%s: %s\n", name, msg);
		return;
	}
	fmt.eprintf("%s(%d:%d): %s\n", name, where.line, where.column, msg);
}

// errorf formats a message with fmt.tprintf and reports it through error
errorf :: proc(s: ^Scanner, format: string, args: ..any) {
	msg := fmt.tprintf(format, ..args);
	error(s, msg);
}

// is_ident_rune reports whether ch may be the ith rune of an identifier.
// The Scanner's own is_ident_rune predicate decides when it is set; otherwise an underscore
// or a letter is accepted at any index and a digit at any index but the first
@(private)
is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
	if custom := s.is_ident_rune; custom != nil {
		return custom(ch, i);
	}
	if ch == '_' || unicode.is_letter(ch) {
		return true;
	}
	return i > 0 && unicode.is_digit(ch);
}

// scan_identifier consumes the remaining runes of an identifier whose first rune is pending
// and returns the first character that is not part of it
@(private)
scan_identifier :: proc(s: ^Scanner) -> rune {
	index := 1; // index 0 was checked by the caller
	following := advance(s);
	for is_ident_rune(s, following, index) {
		following = advance(s);
		index += 1;
	}
	return following;
}

// lower sets the ASCII case bit of ch, mapping 'A'..'Z' onto 'a'..'z'
@(private)
lower :: proc(ch: rune) -> rune {
	return ch | ('a' - 'A');
}

// is_decimal reports whether ch is an ASCII decimal digit
@(private)
is_decimal :: proc(ch: rune) -> bool {
	return ch >= '0' && ch <= '9';
}

// is_hex reports whether ch is an ASCII hexadecimal digit of either case
@(private)
is_hex :: proc(ch: rune) -> bool {
	if ch >= '0' && ch <= '9' {
		return true;
	}
	folded := lower(ch);
	return folded >= 'a' && folded <= 'f';
}


// scan_number scans a numeric literal.
// ch is the pending first character of the literal. If seen_dot is true, a leading '.' has already
// been consumed and ch is the first digit of the fraction.
// It returns the token kind (Int or Float) and the first character following the literal.
// Malformed literals are reported through error/errorf but are still scanned to their end
@(private)
scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
	// lit_name names the kind of literal selected by prefix, for use in error messages
	// '0' denotes a C-style octal literal introduced by a bare leading zero
	lit_name :: proc(prefix: rune) -> string {
		switch prefix {
		case 'b':      return "binary literal";
		case 'o', '0': return "octal literal";
		case 'z':      return "dozenal literal";
		case 'x':      return "hexadecimal literal";
		}
		return "decimal literal";
	}

	// digits consumes a run of digits and '_' separators starting with the pending character ch0
	// It returns the first character after the run and a bit mask: bit 0 is set if a digit was seen,
	// bit 1 if a '_' was seen. For base <= 10 the first decimal digit that is too large for the base
	// is stored in invalid^ (if nothing is stored there yet); for larger bases any hexadecimal digit is accepted
	digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
		ch = ch0;
		if base <= 10 {
			limit := rune('0' + base);
			for is_decimal(ch) || ch == '_' {
				ds := 1;
				if ch == '_' {
					ds = 2;
				} else if ch >= limit && invalid^ == 0 {
					// never reached with base 10, so invalid may be nil in that case
					invalid^ = ch;
				}
				digsep |= ds;
				ch = advance(s);
			}
		} else {
			for is_hex(ch) || ch == '_' {
				ds := 1;
				if ch == '_' {
					ds = 2;
				}
				digsep |= ds;
				ch = advance(s);
			}
		}
		return;
	}

	ch, seen_dot := ch, seen_dot;

	base := 10;
	prefix := rune(0);  // 0 means no prefix (plain decimal)
	digsep := 0;        // bit 0: digit present, bit 1: '_' present
	invalid := rune(0); // first digit that is out of range for base, 0 if none

	tok: rune;
	ds: int;

	if !seen_dot {
		tok = Int;
		if ch == '0' {
			ch = advance(s);

			p := lower(ch);
			if .Scan_C_Int_Prefixes in s.flags {
				switch p {
				case 'b':
					ch = advance(s);
					base, prefix = 2, 'b';
				case 'x':
					ch = advance(s);
					base, prefix = 16, 'x';
				case:
					// A bare leading zero is octal for integers, but "0.5" and "0e5" must still
					// scan as decimal floats, so it gets its own prefix instead of 'o'
					base, prefix = 8, '0';
					digsep = 1; // Leading zero
				}
			} else {
				switch p {
				case 'b':
					ch = advance(s);
					base, prefix = 2, 'b';
				case 'o':
					ch = advance(s);
					base, prefix = 8, 'o';
				case 'd':
					ch = advance(s);
					base, prefix = 10, 'd';
				case 'z':
					ch = advance(s);
					base, prefix = 12, 'z';
				case 'h':
					// 0h introduces a float written as hexadecimal digits
					tok = Float;
					fallthrough;
				case 'x':
					ch = advance(s);
					base, prefix = 16, 'x';
				case:
					digsep = 1; // Leading zero
				}
			}
		}

		ch, ds = digits(s, ch, base, &invalid);
		digsep |= ds;
		if ch == '.' && .Scan_Floats in s.flags {
			ch = advance(s);
			seen_dot = true;
		}
	}

	if seen_dot {
		tok = Float;
		if prefix != 0 && prefix != 'x' && prefix != '0' {
			errorf(s, "invalid radix point in %s", lit_name(prefix));
		}
		ch, ds = digits(s, ch, base, &invalid);
		digsep |= ds;
	}

	if digsep&1 == 0 {
		errorf(s, "%s has no digits", lit_name(prefix));
	}

	if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			errorf(s, "%q exponent requires decimal mantissa", ch);
		case e == 'p' && prefix != 'x':
			errorf(s, "%q exponent requires hexadecimal mantissa", ch);
		}
		ch = advance(s);
		tok = Float;
		if ch == '+' || ch == '-' {
			ch = advance(s);
		}
		// base 10 never stores an invalid digit, so no destination is needed
		ch, ds = digits(s, ch, 10, nil);
		digsep |= ds;
		if ds&1 == 0 {
			error(s, "exponent has no digits");
		}
	} else if prefix == 'x' && seen_dot {
		// Only a hexadecimal mantissa with a radix point needs an exponent;
		// a 0h literal without one is complete as written
		error(s, "hexadecimal mantissa requires a 'p' exponent");
	}

	if tok == Int && invalid != 0 {
		errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix));
	}

	if digsep&2 != 0 {
		s.tok_end = s.src_pos - s.prev_char_len;
	}
	return tok, ch;
}

// scan_string scans the remainder of a literal delimited by quote; the opening quote is the pending
// character. It stops with the closing quote pending and returns the number of characters in the
// literal, counting an escape sequence as one.
// An unterminated literal (newline or end of source before the closing quote) and malformed escape
// sequences are reported through error
@(private)
scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
	// digit_val returns the numeric value of the hexadecimal digit ch,
	// or 16 (larger than any digit of a supported base) if ch is not one
	digit_val :: proc(ch: rune) -> int {
		switch {
		case '0' <= ch && ch <= '9':
			return int(ch - '0');
		case 'a' <= lower(ch) && lower(ch) <= 'f':
			return int(lower(ch) - 'a' + 10);
		}
		return 16;
	}

	// scan_digits consumes exactly n digits of the given base starting with the pending character ch
	// and returns the character following them; fewer than n digits is an error
	scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
		ch, n := ch, n;
		for n > 0 && digit_val(ch) < base {
			ch = advance(s);
			n -= 1;
		}
		if n > 0 {
			error(s, "invalid char escape");
		}
		return ch;
	}

	ch := advance(s); // consume the opening quote
	for ch != quote {
		if ch == '\n' || ch < 0 {
			error(s, "literal not terminated");
			return;
		}
		if ch == '\\' {
			ch = advance(s);
			switch ch {
			case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
				ch = advance(s);
			case '0'..'7': ch = scan_digits(s, advance(s), 8, 3);
			case 'x':      ch = scan_digits(s, advance(s), 16, 2);
			case 'u':      ch = scan_digits(s, advance(s), 16, 4);
			case 'U':      ch = scan_digits(s, advance(s), 16, 8);
			case:
				// ch is left pending and handled as an ordinary character by the next iteration
				error(s, "invalid char escape");
			}
		} else {
			ch = advance(s);
		}
		n += 1;
	}
	return;
}

// scan_raw_string scans the remainder of a raw string literal; the opening back quote is the
// pending character. It stops with the closing back quote pending, or reports an error
// if the source ends first
@(private)
scan_raw_string :: proc(s: ^Scanner) {
	for ch := advance(s); ch != '`'; ch = advance(s) {
		if ch < 0 {
			error(s, "literal not terminated");
			return;
		}
	}
}

// scan_char scans the remainder of a character literal and reports an error
// unless it contains exactly one character (an escape sequence counts as one)
@(private)
scan_char :: proc(s: ^Scanner) {
	count := scan_string(s, '\'');
	if count == 1 {
		return;
	}
	error(s, "invalid char literal");
}

// scan_comment scans the remainder of a comment. The leading '/' has been consumed and ch, the
// pending character, is '/' for a line comment; anything else is treated as a block comment.
// It returns the character following the comment: the newline or end of source that ends a line
// comment, or the character after the closing "*/". An unterminated block comment is reported
@(private)
scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
	if ch == '/' { // line comment
		c := advance(s);
		for c >= 0 && c != '\n' {
			c = advance(s);
		}
		return c;
	}

	// block /**/ comment: look at pairs of consecutive characters until "*/" is found
	prev := advance(s);
	for prev >= 0 {
		cur := advance(s);
		if prev == '*' && cur == '/' {
			return advance(s);
		}
		prev = cur;
	}

	error(s, "comment not terminated");
	return prev;
}

// scan reads the next token or Unicode character from source and returns it
// It only recognizes tokens whose respective flag is set in s.flags
// It returns EOF at the end of the source
// On return (other than the early EOF return) s.pos holds the start position of the token and
// token_text(s) yields its text
// It reports Scanner errors by calling s.error, if not nil; otherwise the error message is printed to standard error
scan :: proc(s: ^Scanner) -> (tok: rune) {
	ch := peek(s);
	if ch == EOF {
		return ch;
	}

	// reset position
	s.tok_pos = -1;
	s.pos.line = 0;

	redo: for {
		// skip white space; EOF is negative and therefore outside the domain of the Whitespace set
		for 0 <= ch && ch < utf8.RUNE_SELF && ch in s.whitespace {
			ch = advance(s);
		}

		// the token starts at the pending character
		s.tok_pos = s.src_pos - s.prev_char_len;
		s.pos.offset = s.tok_pos;

		if s.column > 0 {
			s.pos.line = s.line;
			s.pos.column = s.column;
		} else {
			// previous character was newline
			s.pos.line = s.line - 1;
			s.pos.column = s.prev_line_len;
		}

		// unless a token kind is recognized below, the character itself is the token
		tok = ch;
		if is_ident_rune(s, ch, 0) {
			if .Scan_Idents in s.flags {
				tok = Ident;
				ch = scan_identifier(s);
			} else {
				ch = advance(s);
			}

		} else if is_decimal(ch) {
			if .Scan_Ints in s.flags || .Scan_Floats in s.flags {
				tok, ch = scan_number(s, ch, false);
			} else {
				ch = advance(s);
			}
		} else {
			switch ch {
			case EOF:
				break;
			case '"':
				if .Scan_Strings in s.flags {
					scan_string(s, '"');
					tok = String;
				}
				ch = advance(s);
			case '\'':
				if .Scan_Chars in s.flags {
					// scan_char also reports literals that do not hold exactly one character
					scan_char(s);
					tok = Char;
				}
				ch = advance(s);
			case '`':
				if .Scan_Raw_Strings in s.flags {
					scan_raw_string(s);
					tok = Raw_String;
				}
				ch = advance(s);
			case '.':
				ch = advance(s);
				if is_decimal(ch) && .Scan_Floats in s.flags {
					tok, ch = scan_number(s, ch, true);
				}
			case '/':
				ch = advance(s);
				if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
					if .Skip_Comments in s.flags {
						// treat the comment as white space and start over with what follows it
						s.tok_pos = -1;
						ch = scan_comment(s, ch);
						continue redo;
					}
					ch = scan_comment(s, ch);
					tok = Comment;
				}
			case:
				ch = advance(s);
			}
		}

		break redo;
	}

	// the token ends just before the pending character
	s.tok_end = s.src_pos - s.prev_char_len;

	s.ch = ch;
	return tok;
}

// position returns the position of the character immediately after the character or token returned
// by the previous call to next or scan
// Use the Scanner's pos field for the start position of the most recently scanned token
position :: proc(s: ^Scanner) -> Position {
	// Nothing read yet: report the start of the source
	line, column := 1, 1;

	if s.column > 0 {
		line, column = s.line, s.column;
	} else if s.prev_line_len > 0 {
		// the last character read was a newline; report the end of the previous line
		line, column = s.line-1, s.prev_line_len;
	}

	return Position{
		filename = s.pos.filename,
		offset   = s.src_pos - s.prev_char_len,
		line     = line,
		column   = column,
	};
}

// token_text returns the text of the most recently scanned token as a slice of the source
// It returns "" when no token text is available (tok_pos is negative)
token_text :: proc(s: ^Scanner) -> string {
	start, end := s.tok_pos, s.tok_end;
	if start >= 0 {
		return string(s.src[start:end]);
	}
	return "";
}

// token_string returns a printable string for a token or Unicode character:
// the name of the token kind, or the character formatted with %q
// The result is always freshly allocated; by default with the context.temp_allocator
token_string :: proc(tok: rune, allocator := context.temp_allocator) -> string {
	context.allocator = allocator;

	name: string;
	switch tok {
	case EOF:        name = "EOF";
	case Ident:      name = "Ident";
	case Int:        name = "Int";
	case Float:      name = "Float";
	case Char:       name = "Char";
	case String:     name = "String";
	case Raw_String: name = "Raw_String";
	case Comment:    name = "Comment";
	case:
		// not a token kind: an ordinary character
		return fmt.aprintf("%q", tok);
	}
	return strings.clone(name);
}