package odin_tokenizer

import "core:fmt"
import "core:odin/token"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: token.Pos, fmt: string, args: ..any);

Tokenizer :: struct {
    // Immutable data
    path: string,
    src:  []byte,
    err:  Error_Handler,

    // Tokenizing state
    ch:          rune,
    offset:      int,
    read_offset: int,
    line_offset: int,
    line_count:  int,

    // Mutable data
    error_count: int,
}

init :: proc(t: ^Tokenizer, src: []byte, path: string, err: Error_Handler = default_error_handler) {
    t.src = src;
    t.err = err;
    t.ch = ' ';
    t.offset = 0;
    t.read_offset = 0;
    t.line_offset = 0;
    t.line_count = len(src) > 0 ? 1 : 0;
    t.error_count = 0;
    t.path = path;

    advance_rune(t);
    if t.ch == utf8.RUNE_BOM {
        advance_rune(t);
    }
}

@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> token.Pos {
    line := t.line_count;
    column := offset - t.line_offset + 1;

    return token.Pos {
        file = t.path,
        offset = offset,
        line = line,
        column = column,
    };
}

default_error_handler :: proc(pos: token.Pos, msg: string, args: ..any) {
    fmt.printf_err("%s(%d:%d) ", pos.file, pos.line, pos.column);
    fmt.printf_err(msg, ..args);
    fmt.printf_err("\n");
}

error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
    pos := offset_to_pos(t, offset);
    if t.err != nil {
        t.err(pos, msg, ..args);
    }
    t.error_count += 1;
}

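// advance_rune reads the next Unicode code point from `src` into `ch`.
// It maintains the invariants that `offset` is the byte index of `ch` and
// `read_offset` the index just past it, bumping `line_count`/`line_offset`
// whenever a '\n' is consumed. At end of input, `ch` is set to -1.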
advance_rune :: proc(using t: ^Tokenizer) {
    if read_offset < len(src) {
        offset = read_offset;
        if ch == '\n' {
            line_offset = offset;
            line_count += 1;
        }
        r, w := rune(src[read_offset]), 1;
        switch {
        case r == 0:
            error(t, t.offset, "illegal character NUL");
        case r >= utf8.RUNE_SELF:
            r, w = utf8.decode_rune(src[read_offset:]);
            if r == utf8.RUNE_ERROR && w == 1 {
                error(t, t.offset, "illegal UTF-8 encoding");
            } else if r == utf8.RUNE_BOM && offset > 0 {
                error(t, t.offset, "illegal byte order mark");
            }
        }
        read_offset += w;
        ch = r;
    } else {
        offset = len(src);
        if ch == '\n' {
            line_offset = offset;
            line_count += 1;
        }
        ch = -1;
    }
}

peek_byte :: proc(using t: ^Tokenizer, offset := 0) -> byte {
    if read_offset+offset < len(src) {
        return src[read_offset+offset];
    }
    return 0;
}

skip_whitespace :: proc(t: ^Tokenizer) {
    for t.ch == ' '  ||
        t.ch == '\t' ||
        t.ch == '\n' ||
        t.ch == '\r' {
        advance_rune(t);
    }
}

is_letter :: proc(r: rune) -> bool {
    if r < utf8.RUNE_SELF {
        switch r {
        case '_':
            return true;
        case 'A'..'Z', 'a'..'z':
            return true;
        }
    }
    // TODO(bill): Add unicode lookup tables
    return false;
}

is_digit :: proc(r: rune) -> bool {
    // TODO(bill): Add unicode lookup tables
    return '0' <= r && r <= '9';
}

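// scan_comment is entered from `scan` with the opening '/' (or '#') already
// consumed. It handles line comments (`//`), block comments (`/* */`), and
// `#!` shebang-style lines, returning the comment text with its delimiters.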
scan_comment :: proc(t: ^Tokenizer) -> string {
    offset := t.offset-1;
    next := -1;
    general: {
        if t.ch == '/' || t.ch == '!' { // // #! comments
            advance_rune(t);
            for t.ch != '\n' && t.ch >= 0 {
                advance_rune(t);
            }

            next = t.offset;
            if t.ch == '\n' {
                next += 1;
            }
            break general;
        }

        /* style comment */
        advance_rune(t);
        for t.ch >= 0 {
            ch := t.ch;
            advance_rune(t);
            if ch == '*' && t.ch == '/' {
                advance_rune(t);
                next = t.offset;
                break general;
            }
        }

        error(t, offset, "comment not terminated");
    }

    lit := t.src[offset : t.offset];

    // NOTE(bill): Strip CR for line comments
    for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
        lit = lit[:len(lit)-1];
    }

    return string(lit);
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
    offset := t.offset;
    for is_letter(t.ch) || is_digit(t.ch) {
        advance_rune(t);
    }
    return string(t.src[offset : t.offset]);
}

scan_string :: proc(t: ^Tokenizer) -> string {
    offset := t.offset-1;

    for {
        ch := t.ch;
        if ch == '\n' || ch < 0 {
            error(t, offset, "string literal was not terminated");
            break;
        }
        advance_rune(t);
        if ch == '"' {
            break;
        }
        if ch == '\\' {
            scan_escape(t);
        }
    }

    return string(t.src[offset : t.offset]);
}

scan_raw_string :: proc(t: ^Tokenizer) -> string {
    offset := t.offset-1;

    for {
        ch := t.ch;
        if ch == '\n' || ch < 0 {
            error(t, offset, "raw string literal was not terminated");
            break;
        }
        advance_rune(t);
        if ch == '`' {
            break;
        }
    }

    return string(t.src[offset : t.offset]);
}

digit_val :: proc(r: rune) -> int {
    switch r {
    case '0'..'9':
        return int(r-'0');
    case 'A'..'F':
        return int(r-'A' + 10);
    case 'a'..'f':
        return int(r-'a' + 10);
    }
    // 16 is larger than any valid digit value, so callers can treat
    // `digit_val(ch) < base` as "ch is a digit in this base"
    return 16;
}

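// scan_escape validates one escape sequence after the '\' has been consumed:
// a single-character escape (\a \b \e \f \n \t \v \\ \' \"), three octal
// digits, \x plus two hex digits, \u plus four hex digits, or \U plus eight
// hex digits. Numeric escapes must fit the allowed maximum and must not name
// a surrogate code point.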
scan_escape :: proc(t: ^Tokenizer) -> bool {
    offset := t.offset;

    n: int;
    base, max: u32;
    switch t.ch {
    case 'a', 'b', 'e', 'f', 'n', 't', 'v', '\\', '\'', '\"':
        advance_rune(t);
        return true;

    case '0'..'7':
        n, base, max = 3, 8, 255;
    case 'x':
        advance_rune(t);
        n, base, max = 2, 16, 255;
    case 'u':
        advance_rune(t);
        n, base, max = 4, 16, utf8.MAX_RUNE;
    case 'U':
        advance_rune(t);
        n, base, max = 8, 16, utf8.MAX_RUNE;
    case:
        if t.ch < 0 {
            error(t, offset, "escape sequence was not terminated");
        } else {
            error(t, offset, "unknown escape sequence");
        }
        return false;
    }

    x: u32;
    for n > 0 {
        d := u32(digit_val(t.ch));
        if d >= base {
            if t.ch < 0 {
                error(t, t.offset, "escape sequence was not terminated");
            } else {
                error(t, t.offset, "illegal character %d in escape sequence", t.ch);
            }
            return false;
        }

        x = x*base + d;
        advance_rune(t);
        n -= 1;
    }

    if x > max || 0xD800 <= x && x < 0xE000 {
        error(t, offset, "escape sequence is an invalid Unicode code point");
        return false;
    }
    return true;
}

scan_rune :: proc(t: ^Tokenizer) -> string {
    offset := t.offset-1;
    valid := true;
    n := 0;
    for {
        ch := t.ch;
        if ch == '\n' || ch < 0 {
            if valid {
                error(t, offset, "rune literal not terminated");
                valid = false;
            }
            break;
        }
        advance_rune(t);
        if ch == '\'' {
            break;
        }
        n += 1;
        if ch == '\\' {
            if !scan_escape(t) {
                valid = false;
            }
        }
    }

    if valid && n != 1 {
        error(t, offset, "illegal rune literal");
    }

    return string(t.src[offset : t.offset]);
}

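// scan_number scans integer, float, and imaginary literals. Prefixed integer
// forms (0b binary, 0o octal, 0d decimal, 0z dozenal, 0x hexadecimal, and 0h
// hexadecimal floats of exactly 8 or 16 digits) are handled in the '0'
// branch; an 'e'/'E' exponent promotes the token to a float and a trailing
// 'i' to an imaginary. `seen_decimal_point` is true when `scan` has already
// consumed a leading '.', as in `.5`.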
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (token.Kind, string) {
    scan_mantissa :: proc(t: ^Tokenizer, base: int) {
        for digit_val(t.ch) < base || t.ch == '_' {
            advance_rune(t);
        }
    }
    scan_exponent :: proc(t: ^Tokenizer, kind: ^token.Kind) {
        if t.ch == 'e' || t.ch == 'E' {
            kind^ = token.Float;
            advance_rune(t);
            if t.ch == '-' || t.ch == '+' {
                advance_rune(t);
            }
            if digit_val(t.ch) < 10 {
                scan_mantissa(t, 10);
            } else {
                error(t, t.offset, "illegal floating-point exponent");
            }
        }

        // NOTE(bill): This needs to be here for sanity's sake
        if t.ch == 'i' {
            kind^ = token.Imag;
            advance_rune(t);
        }
    }
    scan_fraction :: proc(t: ^Tokenizer, kind: ^token.Kind) -> (early_exit: bool) {
        if t.ch == '.' && peek_byte(t) == '.' {
            return true; // `..` is a range/ellipsis, not a fraction
        }
        if t.ch == '.' {
            kind^ = token.Float;
            advance_rune(t);
            scan_mantissa(t, 10);
        }
        return false;
    }

    offset := t.offset;
    kind := token.Integer;

    if seen_decimal_point {
        offset -= 1;
        kind = token.Float;
        scan_mantissa(t, 10);
        scan_exponent(t, &kind);
    } else {
        if t.ch == '0' {
            int_base :: inline proc(t: ^Tokenizer, kind: ^token.Kind, base: int, msg: string) {
                prev := t.offset;
                advance_rune(t);
                scan_mantissa(t, base);
                if t.offset - prev <= 1 {
                    kind^ = token.Invalid;
                    error(t, t.offset, msg);
                }
            }

            advance_rune(t);
            switch t.ch {
            case 'b': int_base(t, &kind,  2, "illegal binary integer");
            case 'o': int_base(t, &kind,  8, "illegal octal integer");
            case 'd': int_base(t, &kind, 10, "illegal decimal integer");
            case 'z': int_base(t, &kind, 12, "illegal dozenal integer");
            case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer");
            case 'h':
                prev := t.offset;
                advance_rune(t);
                scan_mantissa(t, 16);
                if t.offset - prev <= 1 {
                    kind = token.Invalid;
                    error(t, t.offset, "illegal hexadecimal floating-point number");
                } else {
                    sub := t.src[prev+1 : t.offset];
                    digit_count := 0;
                    for d in sub {
                        if d != '_' {
                            digit_count += 1;
                        }
                    }

                    switch digit_count {
                    case 8, 16: break;
                    case:
                        error(t, t.offset, "invalid hexadecimal floating-point number, expected 8 or 16 digits, got %d", digit_count);
                    }
                }
            case:
                seen_decimal_point = false;
                scan_mantissa(t, 10);

                if t.ch == '.' {
                    seen_decimal_point = true;
                    if scan_fraction(t, &kind) {
                        return kind, string(t.src[offset : t.offset]);
                    }
                }

                scan_exponent(t, &kind);
                return kind, string(t.src[offset : t.offset]);
            }
        }
    }

    scan_mantissa(t, 10);

    if scan_fraction(t, &kind) {
        return kind, string(t.src[offset : t.offset]);
    }

    scan_exponent(t, &kind);

    return kind, string(t.src[offset : t.offset]);
}

scan :: proc(t: ^Tokenizer) -> token.Token {
    switch2 :: proc(t: ^Tokenizer, tok0, tok1: token.Kind) -> token.Kind {
        if t.ch == '=' {
            advance_rune(t);
            return tok1;
        }
        return tok0;
    }
    switch3 :: proc(t: ^Tokenizer, tok0, tok1: token.Kind, ch2: rune, tok2: token.Kind) -> token.Kind {
        if t.ch == '=' {
            advance_rune(t);
            return tok1;
        }
        if t.ch == ch2 {
            advance_rune(t);
            return tok2;
        }
        return tok0;
    }
    switch4 :: proc(t: ^Tokenizer, tok0, tok1: token.Kind, ch2: rune, tok2, tok3: token.Kind) -> token.Kind {
        if t.ch == '=' {
            advance_rune(t);
            return tok1;
        }
        if t.ch == ch2 {
            advance_rune(t);
            if t.ch == '=' {
                advance_rune(t);
                return tok3;
            }
            return tok2;
        }
        return tok0;
    }
    skip_whitespace(t);

    offset := t.offset;

    kind: token.Kind;
    lit: string;
    pos := offset_to_pos(t, offset);

    switch ch := t.ch; true {
    case is_letter(ch):
        lit = scan_identifier(t);
        kind = token.Ident;
        check_keyword: if len(lit) > 1 {
            // TODO(bill): Maybe have a hash table lookup rather than this linear search
            for i in token.B_Keyword_Begin .. token.B_Keyword_End {
                if lit == token.tokens[i] {
                    kind = token.Kind(i);
                    break check_keyword;
                }
            }
            for keyword, i in token.custom_keyword_tokens {
                if lit == keyword {
                    kind = token.Kind(i+1)+token.B_Custom_Keyword_Begin;
                    break check_keyword;
                }
            }
        }
    case '0' <= ch && ch <= '9':
        kind, lit = scan_number(t, false);
    case:
        advance_rune(t);
        switch ch {
        case -1:
            kind = token.EOF;
        case '"':
            kind = token.String;
            lit = scan_string(t);
        case '\'':
            kind = token.Rune;
            lit = scan_rune(t);
        case '`':
            kind = token.String;
            lit = scan_raw_string(t);
        case '=':
            if t.ch == '>' {
                advance_rune(t);
                kind = token.Double_Arrow_Right;
            } else {
                kind = switch2(t, token.Eq, token.Cmp_Eq);
            }
        case '!': kind = switch2(t, token.Not, token.Not_Eq);
        case '#':
            kind = token.Hash;
            if t.ch == '!' {
                kind = token.Comment;
                lit = scan_comment(t);
            }
        case '?': kind = token.Question;
        case '@': kind = token.At;
        case '$': kind = token.Dollar;
        case '^': kind = token.Pointer;
        case '+': kind = switch2(t, token.Add, token.Add_Eq);
        case '-':
            if t.ch == '>' {
                advance_rune(t);
                kind = token.Arrow_Right;
            } else if t.ch == '-' && peek_byte(t) == '-' {
                advance_rune(t);
                advance_rune(t);
                kind = token.Undef;
            } else {
                kind = switch2(t, token.Sub, token.Sub_Eq);
            }
        case '*': kind = switch2(t, token.Mul, token.Mul_Eq);
        case '/':
            if t.ch == '/' || t.ch == '*' {
                kind = token.Comment;
                lit = scan_comment(t);
            } else {
                kind = switch2(t, token.Quo, token.Quo_Eq);
            }
        case '%': kind = switch4(t, token.Mod, token.Mod_Eq, '%', token.Mod_Mod, token.Mod_Mod_Eq);
        case '&':
            if t.ch == '~' {
                advance_rune(t);
                kind = switch2(t, token.And_Not, token.And_Not_Eq);
            } else {
                kind = switch3(t, token.And, token.And_Eq, '&', token.Cmp_And);
            }
        case '|': kind = switch3(t, token.Or, token.Or_Eq, '|', token.Cmp_Or);
        case '~': kind = token.Xor;
        case '<':
            if t.ch == '-' {
                advance_rune(t);
                kind = token.Arrow_Left;
            } else {
                kind = switch4(t, token.Lt, token.Lt_Eq, '<', token.Shl, token.Shl_Eq);
            }
        case '>': kind = switch4(t, token.Gt, token.Gt_Eq, '>', token.Shr, token.Shr_Eq);

        case '≠': kind = token.Not_Eq;
        case '≤': kind = token.Lt_Eq;
        case '≥': kind = token.Gt_Eq;
        case '∈': kind = token.In;
        case '∉': kind = token.Notin;

        case '.':
            if '0' <= t.ch && t.ch <= '9' {
                kind, lit = scan_number(t, true);
            } else {
                kind = token.Period;
                if t.ch == '.' {
                    advance_rune(t);
                    kind = token.Ellipsis;
                    if t.ch == '<' {
                        advance_rune(t);
                        kind = token.Range_Half;
                    }
                }
            }
        case ':': kind = token.Colon;
        case ',': kind = token.Comma;
        case ';': kind = token.Semicolon;
        case '(': kind = token.Open_Paren;
        case ')': kind = token.Close_Paren;
        case '[': kind = token.Open_Bracket;
        case ']': kind = token.Close_Bracket;
        case '{': kind = token.Open_Brace;
        case '}': kind = token.Close_Brace;
        case '\\': kind = token.Back_Slash;
        case:
            if ch != utf8.RUNE_BOM {
                error(t, t.offset, "illegal character '%r': %d", ch, ch);
            }
            kind = token.Invalid;
        }
    }

    if lit == "" {
        lit = string(t.src[offset : t.offset]);
    }

    return token.Token{kind, lit, pos};
}
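
// What follows is a minimal usage sketch, not part of the tokenizer proper:
// `print_all_tokens` is a hypothetical driver showing how `init` and `scan`
// are meant to be combined. It assumes `token.Token` exposes `kind` and
// `text` fields matching the positional construction
// `token.Token{kind, lit, pos}` above.
print_all_tokens :: proc(src: []byte, path: string) {
    t: Tokenizer;
    init(&t, src, path);
    for {
        tok := scan(&t);
        if tok.kind == token.EOF {
            break;
        }
        fmt.printf("%v %q\n", tok.kind, tok.text);
    }
}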