package odin_tokenizer

import "core:fmt"
import "core:unicode/utf8"

// NOTE: Pos, Token, Token_Kind, and the keyword tables used below are
// declared elsewhere in this package.

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any);

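// Tokenizer scans an Odin source buffer one rune at a time; errors are
// reported through the user-supplied Error_Handler.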
Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  []byte,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune, // current rune
	offset:      int,  // byte offset of ch
	read_offset: int,  // byte offset of the next rune to read
	line_offset: int,  // byte offset of the start of the current line
	line_count:  int,  // current line number, starting at 1

	// Mutable data
	error_count: int,
}

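// init resets all tokenizing state and primes the first rune, skipping
// a leading byte order mark if present.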
init :: proc(t: ^Tokenizer, src: []byte, path: string, err: Error_Handler = default_error_handler) {
	t.src = src;
	t.err = err;
	t.ch = ' ';
	t.offset = 0;
	t.read_offset = 0;
	t.line_offset = 0;
	t.line_count = len(src) > 0 ? 1 : 0;
	t.error_count = 0;
	t.path = path;

	advance_rune(t);
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t);
	}
}

@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count;
	column := offset - t.line_offset + 1;

	return Pos {
		file   = t.path,
		offset = offset,
		line   = line,
		column = column,
	};
}

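// default_error_handler prints to stderr as "path(line:column) message".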
default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column);
	fmt.eprintf(msg, ..args);
	fmt.eprintf("\n");
}

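// error reports a formatted message through the installed handler (if
// any) and bumps error_count so callers can detect a failed scan.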
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset);
	if t.err != nil {
		t.err(pos, msg, ..args);
	}
	t.error_count += 1;
}

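// advance_rune decodes the next Unicode code point from src into t.ch;
// t.ch < 0 signals end of file. Line bookkeeping happens here, when the
// previously read rune was a newline.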
advance_rune :: proc(using t: ^Tokenizer) {
	if read_offset < len(src) {
		offset = read_offset;
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		r, w := rune(src[read_offset]), 1;
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL");
		case r >= utf8.RUNE_SELF:
			// Not ASCII, decode a multi-byte UTF-8 sequence
			r, w = utf8.decode_rune(src[read_offset:]);
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding");
			} else if r == utf8.RUNE_BOM && offset > 0 {
				error(t, t.offset, "illegal byte order mark");
			}
		}
		read_offset += w;
		ch = r;
	} else {
		offset = len(src);
		if ch == '\n' {
			line_offset = offset;
			line_count += 1;
		}
		ch = -1; // end of file
	}
}

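// peek_byte returns the byte `offset` bytes past the current read
// position without consuming anything, or 0 when out of range.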
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset];
	}
	return 0;
}

skip_whitespace :: proc(t: ^Tokenizer) {
	for t.ch == ' ' || t.ch == '\t' || t.ch == '\n' || t.ch == '\r' {
		advance_rune(t);
	}
}

is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true;
		case 'A'..'Z', 'a'..'z':
			return true;
		}
	}
	// TODO(bill): Add unicode lookup tables
	return false;
}

is_digit :: proc(r: rune) -> bool {
	// TODO(bill): Add unicode lookup tables
	return '0' <= r && r <= '9';
}

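// scan_comment consumes a line comment (`//` or a `#!` shebang) or a
// `/* */` block comment and returns its text. The first character of
// the comment has already been consumed by the caller.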
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	next := -1;
	general: {
		if t.ch == '/' || t.ch == '!' { // `//` line comment or `#!` shebang
			advance_rune(t);
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t);
			}

			next = t.offset;
			if t.ch == '\n' {
				next += 1;
			}
			break general;
		}

		/* style comment */
		advance_rune(t);
		for t.ch >= 0 {
			ch := t.ch;
			advance_rune(t);
			if ch == '*' && t.ch == '/' {
				advance_rune(t);
				next = t.offset;
				break general;
			}
		}

		error(t, offset, "comment not terminated");
	}

	lit := t.src[offset : t.offset];

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1];
	}

	return string(lit);
}

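// scan_identifier consumes a run of letters, digits, and underscores
// and returns the matched text. The caller guarantees the current rune
// starts an identifier.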
scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset;

	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t);
	}

	return string(t.src[offset : t.offset]);
}

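// scan_string consumes a double-quoted string literal; the opening
// quote has already been consumed, and the returned text includes both
// quotes.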
scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '"' {
			break;
		}
		if ch == '\\' {
			scan_escape(t);
		}
	}

	return string(t.src[offset : t.offset]);
}

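// scan_raw_string consumes a back-quoted raw string literal, which
// cannot span lines and has no escape sequences.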
scan_raw_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			error(t, offset, "raw string literal was not terminated");
			break;
		}
		advance_rune(t);
		if ch == '`' {
			break;
		}
	}

	return string(t.src[offset : t.offset]);
}

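// digit_val maps '0'..'9', 'A'..'F', and 'a'..'f' to their numeric
// values. Any other rune yields 16, which is out of range for every
// supported base and therefore acts as an "invalid digit" sentinel.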
digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..'9':
		return int(r-'0');
	case 'A'..'F':
		return int(r-'A' + 10);
	case 'a'..'f':
		return int(r-'a' + 10);
	}
	return 16;
}

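// scan_escape validates one escape sequence following a backslash:
// either a single-character escape or a fixed-width octal, hex, or
// Unicode value checked against its maximum. Returns false for any
// malformed escape.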
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset;

	n: int;
	base, max: u32;
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', '\\', '\'', '\"':
		advance_rune(t);
		return true;

	case '0'..'7':
		n, base, max = 3, 8, 255;
	case 'x':
		advance_rune(t);
		n, base, max = 2, 16, 255;
	case 'u':
		advance_rune(t);
		n, base, max = 4, 16, utf8.MAX_RUNE;
	case 'U':
		advance_rune(t);
		n, base, max = 8, 16, utf8.MAX_RUNE;
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated");
		} else {
			error(t, offset, "unknown escape sequence");
		}
		return false;
	}

	x: u32;
	for n > 0 {
		d := u32(digit_val(t.ch));
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated");
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch);
			}
			return false;
		}

		x = x*base + d;
		advance_rune(t);
		n -= 1;
	}

	// Reject values above the maximum and the UTF-16 surrogate range
	// U+D800..U+DFFF (U+E000 itself is a valid code point)
	if x > max || 0xd800 <= x && x < 0xe000 {
		error(t, offset, "escape sequence is an invalid Unicode code point");
		return false;
	}
	return true;
}

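// scan_rune consumes a single-quoted rune literal (the opening quote
// has already been consumed) and checks that exactly one rune sits
// between the quotes.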
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1;
	valid := true;
	n := 0;

	for {
		ch := t.ch;
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated");
				valid = false;
			}
			break;
		}
		advance_rune(t);
		if ch == '\'' {
			break;
		}
		n += 1;
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false;
			}
		}
	}

	if valid && n != 1 {
		error(t, offset, "illegal rune literal");
	}

	return string(t.src[offset : t.offset]);
}

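// scan_number scans integer, float, and imaginary literals.
// seen_decimal_point is true when the caller has already consumed a
// '.' that turned out to begin a number. A leading '0' introduces the
// prefixed bases 0b, 0o, 0d, 0z (dozenal), and 0x, plus 0h for
// hexadecimal floats, which must supply exactly 8 or 16 digits.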
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t);
		}
	}

	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float;
			advance_rune(t);
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t);
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10);
			} else {
				error(t, t.offset, "illegal floating-point exponent");
			}
		}

		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag;
			advance_rune(t);
		}
	}

	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true; // `..` is a range/ellipsis operator, not part of the number
		}
		if t.ch == '.' {
			kind^ = .Float;
			advance_rune(t);
			scan_mantissa(t, 10);
		}
		return false;
	}

	offset := t.offset;
	kind := Token_Kind.Integer;
	seen_point := seen_decimal_point;

	if seen_point {
		offset -= 1;
		kind = .Float;
		scan_mantissa(t, 10);
		scan_exponent(t, &kind);
	} else {
		if t.ch == '0' {
			int_base :: inline proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, base);
				if t.offset - prev <= 1 {
					// only the base prefix was found, with no digits after it
					kind^ = .Invalid;
					error(t, t.offset, msg);
				}
			}

			advance_rune(t);
			switch t.ch {
			case 'b': int_base(t, &kind,  2, "illegal binary integer");
			case 'o': int_base(t, &kind,  8, "illegal octal integer");
			case 'd': int_base(t, &kind, 10, "illegal decimal integer");
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer");
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer");
			case 'h':
				prev := t.offset;
				advance_rune(t);
				scan_mantissa(t, 16);
				if t.offset - prev <= 1 {
					kind = .Invalid;
					error(t, t.offset, "illegal hexadecimal floating-point number");
				} else {
					sub := t.src[prev+1 : t.offset];
					digit_count := 0;
					for d in sub {
						if d != '_' {
							digit_count += 1;
						}
					}

					switch digit_count {
					case 8, 16: break;
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 8 or 16 digits, got %d", digit_count);
					}
				}
			case:
				seen_point = false;
				scan_mantissa(t, 10);
				if t.ch == '.' {
					seen_point = true;
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset]);
					}
				}
				scan_exponent(t, &kind);
				return kind, string(t.src[offset : t.offset]);
			}
		}
	}

	scan_mantissa(t, 10);

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset]);
	}

	scan_exponent(t, &kind);

	return kind, string(t.src[offset : t.offset]);
}

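// scan returns the next token. Literal text is sliced directly out of
// the source buffer, so tokens remain valid only as long as src does.
// The switchN helpers resolve operators that share a prefix, e.g.
// switch2 maps `+` to Add and `+=` to Add_Eq.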
scan :: proc(t: ^Tokenizer) -> Token {
	switch2 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		return tok0;
	}

	switch3 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind, ch2: rune, tok2: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			return tok2;
		}
		return tok0;
	}

	switch4 :: proc(t: ^Tokenizer, tok0, tok1: Token_Kind, ch2: rune, tok2, tok3: Token_Kind) -> Token_Kind {
		if t.ch == '=' {
			advance_rune(t);
			return tok1;
		}
		if t.ch == ch2 {
			advance_rune(t);
			if t.ch == '=' {
				advance_rune(t);
				return tok3;
			}
			return tok2;
		}
		return tok0;
	}

	skip_whitespace(t);

	offset := t.offset;

	kind: Token_Kind;
	lit: string;
	pos := offset_to_pos(t, offset);

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t);
		kind = .Ident;
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin .. Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i);
					break check_keyword;
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin;
					break check_keyword;
				}
			}
			if kind == .Ident && lit == "notin" {
				kind = .Not_In;
			}
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false);
	case:
		advance_rune(t);
		switch ch {
		case -1:
			kind = .EOF;
		case '"':
			kind = .String;
			lit = scan_string(t);
		case '\'':
			kind = .Rune;
			lit = scan_rune(t);
		case '`':
			kind = .String;
			lit = scan_raw_string(t);
		case '=':
			if t.ch == '>' {
				advance_rune(t);
				kind = .Double_Arrow_Right;
			} else {
				kind = switch2(t, .Eq, .Cmp_Eq);
			}
		case '!': kind = switch2(t, .Not, .Not_Eq);
		case '#':
			kind = .Hash;
			if t.ch == '!' {
				kind = .Comment;
				lit = scan_comment(t);
			}
		case '?': kind = .Question;
		case '@': kind = .At;
		case '$': kind = .Dollar;
		case '^': kind = .Pointer;
		case '+': kind = switch2(t, .Add, .Add_Eq);
		case '-':
			if t.ch == '>' {
				advance_rune(t);
				kind = .Arrow_Right;
			} else if t.ch == '-' && peek_byte(t) == '-' {
				advance_rune(t);
				advance_rune(t);
				kind = .Undef; // `---`
			} else {
				kind = switch2(t, .Sub, .Sub_Eq);
			}
		case '*': kind = switch2(t, .Mul, .Mul_Eq);
		case '/':
			if t.ch == '/' || t.ch == '*' {
				kind = .Comment;
				lit = scan_comment(t);
			} else {
				kind = switch2(t, .Quo, .Quo_Eq);
			}
		case '%': kind = switch4(t, .Mod, .Mod_Eq, '%', .Mod_Mod, .Mod_Mod_Eq);
		case '&':
			if t.ch == '~' {
				advance_rune(t);
				kind = switch2(t, .And_Not, .And_Not_Eq);
			} else {
				kind = switch3(t, .And, .And_Eq, '&', .Cmp_And);
			}
		case '|': kind = switch3(t, .Or, .Or_Eq, '|', .Cmp_Or);
		case '~': kind = .Xor;
		case '<':
			if t.ch == '-' {
				advance_rune(t);
				kind = .Arrow_Left;
			} else {
				kind = switch4(t, .Lt, .Lt_Eq, '<', .Shl, .Shl_Eq);
			}
		case '>': kind = switch4(t, .Gt, .Gt_Eq, '>', .Shr, .Shr_Eq);
		case '.':
			if '0' <= t.ch && t.ch <= '9' {
				kind, lit = scan_number(t, true);
			} else {
				kind = .Period;
				if t.ch == '.' {
					advance_rune(t);
					kind = .Ellipsis;
					if t.ch == '<' {
						advance_rune(t);
						kind = .Range_Half;
					}
				}
			}
		case ':': kind = .Colon;
		case ',': kind = .Comma;
		case ';': kind = .Semicolon;
		case '(': kind = .Open_Paren;
		case ')': kind = .Close_Paren;
		case '[': kind = .Open_Bracket;
		case ']': kind = .Close_Bracket;
		case '{': kind = .Open_Brace;
		case '}': kind = .Close_Brace;
		case '\\': kind = .Back_Slash;
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch);
			}
			kind = .Invalid;
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset]);
	}
	return Token{kind, lit, pos};
}