- package odin_tokenizer
- import "core:fmt"
- import "core:unicode"
- import "core:unicode/utf8"
// Callback invoked on tokenization errors: receives the source position,
// a printf-style format string, and its arguments.
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
// Optional tokenizer behaviours.
Flag :: enum {
	Insert_Semicolon, // emit implicit semicolons at line ends / EOF (see scan)
}

// Set of Flag values, stored in 32 bits.
Flags :: distinct bit_set[Flag; u32]
// Tokenizer holds all state for scanning a single source string.
Tokenizer :: struct {
	// Immutable data
	path:  string,        // file path reported in positions/errors
	src:   string,        // the full source text being scanned
	err:   Error_Handler, // error callback; may be nil
	flags: Flags,

	// Tokenizing state
	ch:               rune, // current rune; -1 means end of input
	offset:           int,  // byte offset of ch within src
	read_offset:      int,  // byte offset of the next rune to read
	line_offset:      int,  // byte offset of the start of the current line
	line_count:       int,  // 1-based line number (0 for empty source)
	insert_semicolon: bool, // a semicolon should be inserted at the next newline/EOF

	// Mutable data
	error_count: int, // number of errors reported so far
}
// Prepares `t` to tokenize `src`, resetting all scanning state and
// priming the first rune. A leading byte-order mark is skipped.
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.path = path
	t.src  = src
	t.err  = err

	t.ch               = ' '
	t.offset           = 0
	t.read_offset      = 0
	t.line_offset      = 0
	t.insert_semicolon = false
	t.error_count      = 0

	// Non-empty sources start on line 1; an empty source has no lines.
	t.line_count = 0
	if len(src) > 0 {
		t.line_count = 1
	}

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		// A BOM is only legal as the very first rune; consume it silently.
		advance_rune(t)
	}
}
// Converts a byte offset into a Pos, using the tokenizer's current
// line bookkeeping. Column is 1-based from the start of the line.
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	return Pos{
		file   = t.path,
		offset = offset,
		line   = t.line_count,
		column = offset - t.line_offset + 1,
	}
}
// Default Error_Handler: prints "path(line:column) message" followed
// by a newline to standard error.
default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}
// Reports an error at `offset` through the installed handler (if any)
// and bumps the tokenizer's running error count.
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	t.error_count += 1
	if t.err == nil {
		return
	}
	t.err(offset_to_pos(t, offset), msg, ..args)
}
// Reads the next rune from the source into t.ch and updates the
// offset/line bookkeeping. At end of input, t.ch becomes -1 (the EOF
// sentinel) and t.offset is pinned to len(t.src).
advance_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		// The previous rune was a newline: a new line starts here.
		if t.ch == '\n' {
			t.line_offset = t.offset
			t.line_count += 1
		}
		// Assume a single ASCII byte; fall back to full UTF-8 decoding.
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune_in_string(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				// A byte order mark is only permitted as the very first rune.
				error(t, t.offset, "illegal byte order mark")
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		// Account for a trailing newline before reporting EOF positions.
		if t.ch == '\n' {
			t.line_offset = t.offset
			t.line_count += 1
		}
		t.ch = -1
	}
}
// Returns the byte `offset` positions beyond the current read position,
// or 0 when that position is past the end of the source.
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	pos := t.read_offset + offset
	if pos >= len(t.src) {
		return 0
	}
	return t.src[pos]
}
// Advances past whitespace. Newlines count as whitespace too, except
// while a semicolon insertion is pending -- then scanning must stop at
// '\n' so scan can turn it into an implicit semicolon.
skip_whitespace :: proc(t: ^Tokenizer) {
	loop: for {
		switch t.ch {
		case ' ', '\t', '\r':
			advance_rune(t)
		case '\n':
			if t.insert_semicolon {
				break loop
			}
			advance_rune(t)
		case:
			break loop
		}
	}
}
// Reports whether `r` may start an identifier: '_' or an ASCII letter
// (fast path), otherwise any Unicode letter.
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		if r == '_' {
			return true
		}
		if ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') {
			return true
		}
	}
	// Non-ASCII (and remaining ASCII) falls through to the full check.
	return unicode.is_letter(r)
}
// Reports whether `r` is a decimal digit: ASCII '0'..'9' (fast path)
// or any Unicode decimal digit.
is_digit :: proc(r: rune) -> bool {
	return ('0' <= r && r <= '9') || unicode.is_digit(r)
}
// Scans a comment whose first character ('/' of "//" / "/*", or '#'
// of "#!") has already been consumed. Returns the comment text, with
// trailing CRs stripped from line comments.
scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1 // include the already-consumed first character
	next := -1
	general: {
		if t.ch == '/' || t.ch == '!' { // // #! comments
			advance_rune(t)
			// Line comments run to end of line or EOF.
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t)
			}
			next = t.offset
			if t.ch == '\n' {
				next += 1
			}
			break general
		}
		/* style comment */
		advance_rune(t)
		nest := 1 // block comments may nest
		for t.ch >= 0 && nest > 0 {
			ch := t.ch
			advance_rune(t)
			if ch == '/' && t.ch == '*' {
				nest += 1
			}
			if ch == '*' && t.ch == '/' {
				nest -= 1
				advance_rune(t)
				next = t.offset
				if nest == 0 {
					break general
				}
			}
		}
		// Falling out of the loop means EOF inside a block comment.
		error(t, offset, "comment not terminated")
	}
	lit := t.src[offset : t.offset]
	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
	}
	return string(lit)
}
// Consumes a run of identifier characters (letters, digits, '_') and
// returns the consumed slice. Assumes t.ch starts an identifier.
scan_identifier :: proc(t: ^Tokenizer) -> string {
	start := t.offset
	for is_letter(t.ch) || is_digit(t.ch) {
		advance_rune(t)
	}
	return string(t.src[start : t.offset])
}
// Scans a double-quoted string literal; the opening '"' has already
// been consumed. Returns the literal including both quotes. Escapes
// are validated via scan_escape; a newline or EOF inside the literal
// is reported as unterminated.
scan_string :: proc(t: ^Tokenizer) -> string {
	start := t.offset - 1 // include the opening quote
	for {
		c := t.ch
		if c == '\n' || c < 0 {
			error(t, start, "string literal was not terminated")
			break
		}
		advance_rune(t)
		if c == '"' {
			break
		}
		if c == '\\' {
			scan_escape(t)
		}
	}
	return string(t.src[start : t.offset])
}
// Scans a backtick-delimited raw string literal; the opening '`' has
// already been consumed. Raw strings may span lines and contain no
// escapes; only EOF makes one unterminated.
scan_raw_string :: proc(t: ^Tokenizer) -> string {
	start := t.offset - 1 // include the opening backtick
	for {
		c := t.ch
		if c == utf8.RUNE_EOF {
			error(t, start, "raw string literal was not terminated")
			break
		}
		advance_rune(t)
		if c == '`' {
			break
		}
	}
	return string(t.src[start : t.offset])
}
// Value of `r` as a base-16 digit. Non-digits yield 16 so callers can
// simply test `digit_val(ch) < base` for any base up to 16.
digit_val :: proc(r: rune) -> int {
	switch {
	case '0' <= r && r <= '9':
		return int(r - '0')
	case 'a' <= r && r <= 'f':
		return int(r - 'a') + 10
	case 'A' <= r && r <= 'F':
		return int(r - 'A') + 10
	}
	return 16
}
// Scans one escape sequence (the '\\' has already been consumed) and
// returns whether it was valid. Handles single-character escapes,
// octal (\ooo), and hex/Unicode forms (\xhh, \uhhhh, \Uhhhhhhhh).
// Reports an error and returns false on malformed or out-of-range
// sequences.
scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset

	n: int       // number of digits expected
	base, max: u32 // numeric base and maximum allowed value
	switch t.ch {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '\"':
		advance_rune(t)
		return true
	case '0'..='7':
		// Octal: first digit is part of the value, so do not advance.
		n, base, max = 3, 8, 255
	case 'x':
		advance_rune(t)
		n, base, max = 2, 16, 255
	case 'u':
		advance_rune(t)
		n, base, max = 4, 16, utf8.MAX_RUNE
	case 'U':
		advance_rune(t)
		n, base, max = 8, 16, utf8.MAX_RUNE
	case:
		if t.ch < 0 {
			error(t, offset, "escape sequence was not terminated")
		} else {
			error(t, offset, "unknown escape sequence")
		}
		return false
	}

	x: u32
	for n > 0 {
		d := u32(digit_val(t.ch))
		// BUG FIX: was `for d >= base`, a loop whose body always returned --
		// an `if` states the actual intent.
		if d >= base {
			if t.ch < 0 {
				error(t, t.offset, "escape sequence was not terminated")
			} else {
				error(t, t.offset, "illegal character %d in escape sequence", t.ch)
			}
			return false
		}
		x = x*base + d
		advance_rune(t)
		n -= 1
	}

	// BUG FIX: surrogates are U+D800..U+DFFF; U+E000 is a valid scalar
	// value, so the upper bound must be exclusive (was `x <= 0xe000`).
	if x > max || 0xD800 <= x && x < 0xE000 {
		error(t, offset, "escape sequence is an invalid Unicode code point")
		return false
	}
	return true
}
// Scans a rune literal; the opening '\'' has already been consumed.
// Returns the full literal including quotes, reporting an error unless
// exactly one rune (or valid escape) appears between the quotes.
scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1 // include the opening quote
	valid := true
	n := 0 // count of runes seen inside the quotes
	for {
		ch := t.ch
		// A newline or EOF inside the literal means it was never closed.
		if ch == '\n' || ch < 0 {
			if valid {
				error(t, offset, "rune literal not terminated")
				valid = false
			}
			break
		}
		advance_rune(t)
		if ch == '\'' {
			break
		}
		n += 1
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false
			}
		}
	}
	// Only complain about the count when no other error was reported.
	if valid && n != 1 {
		error(t, offset, "illegal rune literal")
	}
	return string(t.src[offset : t.offset])
}
// Scans a numeric literal, returning its kind (.Integer, .Float, or
// .Imag; .Invalid on error) and its text. `seen_decimal_point` is true
// when the caller already consumed a leading '.' (as in ".5").
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	// Consumes digits valid in `base`; '_' separators are allowed.
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base || t.ch == '_' {
			advance_rune(t)
		}
	}
	// Consumes an optional e/E exponent and then an optional i/j/k
	// imaginary suffix, updating `kind` accordingly.
	scan_exponent :: proc(t: ^Tokenizer, kind: ^Token_Kind) {
		if t.ch == 'e' || t.ch == 'E' {
			kind^ = .Float
			advance_rune(t)
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t)
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10)
			} else {
				error(t, t.offset, "illegal floating-point exponent")
			}
		}
		// NOTE(bill): This needs to be here for sanity's sake
		switch t.ch {
		case 'i', 'j', 'k':
			kind^ = .Imag
			advance_rune(t)
		}
	}
	// Consumes a '.' fraction. Returns true (early exit) on "..": that
	// is an ellipsis/range token, not part of the number.
	scan_fraction :: proc(t: ^Tokenizer, kind: ^Token_Kind) -> (early_exit: bool) {
		if t.ch == '.' && peek_byte(t) == '.' {
			return true
		}
		if t.ch == '.' {
			kind^ = .Float
			advance_rune(t)
			scan_mantissa(t, 10)
		}
		return false
	}

	offset := t.offset
	kind := Token_Kind.Integer
	seen_point := seen_decimal_point

	if seen_point {
		offset -= 1 // include the already-consumed '.'
		kind = .Float
		scan_mantissa(t, 10)
		scan_exponent(t, &kind)
	} else {
		if t.ch == '0' {
			// Scans a base-prefixed integer (0b/0o/0d/0z/0x); reports
			// `msg` when no digits follow the prefix character.
			int_base :: proc(t: ^Tokenizer, kind: ^Token_Kind, base: int, msg: string) {
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, base)
				if t.offset - prev <= 1 {
					kind^ = .Invalid
					error(t, t.offset, msg)
				}
			}

			advance_rune(t)
			switch t.ch {
			case 'b': int_base(t, &kind, 2, "illegal binary integer")
			case 'o': int_base(t, &kind, 8, "illegal octal integer")
			case 'd': int_base(t, &kind, 10, "illegal decimal integer")
			case 'z': int_base(t, &kind, 12, "illegal dozenal integer")
			case 'x': int_base(t, &kind, 16, "illegal hexadecimal integer")
			case 'h':
				// 0h: hexadecimal float; the digit count (ignoring '_')
				// must be 4, 8, or 16.
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, 16)
				if t.offset - prev <= 1 {
					kind = .Invalid
					error(t, t.offset, "illegal hexadecimal floating-point number")
				} else {
					sub := t.src[prev+1 : t.offset]
					digit_count := 0
					for d in sub {
						if d != '_' {
							digit_count += 1
						}
					}
					switch digit_count {
					case 4, 8, 16: break
					case:
						error(t, t.offset, "invalid hexadecimal floating-point number, expected 4, 8, or 16 digits, got %d", digit_count)
					}
				}
			case:
				// Plain number with a leading zero.
				seen_point = false
				scan_mantissa(t, 10)
				if t.ch == '.' {
					seen_point = true
					if scan_fraction(t, &kind) {
						return kind, string(t.src[offset : t.offset])
					}
				}
				scan_exponent(t, &kind)
				return kind, string(t.src[offset : t.offset])
			}
		}
	}

	scan_mantissa(t, 10)

	if scan_fraction(t, &kind) {
		return kind, string(t.src[offset : t.offset])
	}
	scan_exponent(t, &kind)
	return kind, string(t.src[offset : t.offset])
}
// Scans and returns the next token: identifiers/keywords, numbers,
// string/rune literals, comments, all operators and punctuation, plus
// automatic semicolon insertion when .Insert_Semicolon is set.
scan :: proc(t: ^Tokenizer) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	lit: string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident
		// Single-rune identifiers are never keywords, so skip lookup.
		check_keyword: if len(lit) > 1 {
			// TODO(bill): Maybe have a hash table lookup rather than this linear search
			for i in Token_Kind.B_Keyword_Begin ..= Token_Kind.B_Keyword_End {
				if lit == tokens[i] {
					kind = Token_Kind(i)
					break check_keyword
				}
			}
			for keyword, i in custom_keyword_tokens {
				if lit == keyword {
					kind = Token_Kind(i+1) + .B_Custom_Keyword_Begin
					break check_keyword
				}
			}
			break check_keyword
		}
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false)
	case:
		advance_rune(t)
		switch ch {
		case -1:
			// EOF: emit one final implicit semicolon first if pending.
			kind = .EOF
			if t.insert_semicolon {
				t.insert_semicolon = false
				kind = .Semicolon
				lit = "\n"
				return Token{kind, lit, pos}
			}
		case '\n':
			// Reached only when a semicolon insertion is pending
			// (skip_whitespace otherwise consumes newlines).
			t.insert_semicolon = false
			kind = .Semicolon
			lit = "\n"
		case '\\':
			// Line continuation: cancel any pending semicolon and return
			// the next token, which must be on a following line.
			if .Insert_Semicolon in t.flags {
				t.insert_semicolon = false
			}
			token := scan(t)
			if token.pos.line == pos.line {
				error(t, token.pos.offset, "expected a newline after \\")
			}
			return token
		case '\'':
			kind = .Rune
			lit = scan_rune(t)
		case '"':
			kind = .String
			lit = scan_string(t)
		case '`':
			kind = .String
			lit = scan_raw_string(t)
		case '.':
			// '.', '.5' (number), '..', '..<', '..='
			kind = .Period
			switch t.ch {
			case '0'..='9':
				kind, lit = scan_number(t, true)
			case '.':
				advance_rune(t)
				kind = .Ellipsis
				switch t.ch {
				case '<':
					advance_rune(t)
					kind = .Range_Half
				case '=':
					advance_rune(t)
					kind = .Range_Full
				}
			}
		case '@': kind = .At
		case '$': kind = .Dollar
		case '?': kind = .Question
		case '^': kind = .Pointer
		case ';': kind = .Semicolon
		case ',': kind = .Comma
		case ':': kind = .Colon
		case '(': kind = .Open_Paren
		case ')': kind = .Close_Paren
		case '[': kind = .Open_Bracket
		case ']': kind = .Close_Bracket
		case '{': kind = .Open_Brace
		case '}': kind = .Close_Brace
		case '%':
			// '%', '%=', '%%', '%%='
			kind = .Mod
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Mod_Eq
			case '%':
				advance_rune(t)
				kind = .Mod_Mod
				if t.ch == '=' {
					advance_rune(t)
					kind = .Mod_Mod_Eq
				}
			}
		case '*':
			kind = .Mul
			if t.ch == '=' {
				advance_rune(t)
				kind = .Mul_Eq
			}
		case '=':
			kind = .Eq
			if t.ch == '=' {
				advance_rune(t)
				kind = .Cmp_Eq
			}
		case '~':
			kind = .Xor
			if t.ch == '=' {
				advance_rune(t)
				kind = .Xor_Eq
			}
		case '!':
			kind = .Not
			if t.ch == '=' {
				advance_rune(t)
				kind = .Not_Eq
			}
		case '+':
			// '+', '+=', '++'
			kind = .Add
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Add_Eq
			case '+':
				advance_rune(t)
				kind = .Increment
			}
		case '-':
			// '-', '--', '---' (undef), '->', '-='
			kind = .Sub
			switch t.ch {
			case '-':
				advance_rune(t)
				kind = .Decrement
				if t.ch == '-' {
					advance_rune(t)
					kind = .Undef
				}
			case '>':
				advance_rune(t)
				kind = .Arrow_Right
			case '=':
				advance_rune(t)
				kind = .Sub_Eq
			}
		case '#':
			// '#' directive, or a '#!' shebang-style comment.
			kind = .Hash
			if t.ch == '!' {
				kind = .Comment
				lit = scan_comment(t)
			}
		case '/':
			// '/', '//' or '/*' comment, '/='
			kind = .Quo
			switch t.ch {
			case '/', '*':
				kind = .Comment
				lit = scan_comment(t)
			case '=':
				advance_rune(t)
				kind = .Quo_Eq
			}
		case '<':
			// '<', '<=', '<<', '<<='
			kind = .Lt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Lt_Eq
			case '<':
				advance_rune(t)
				kind = .Shl
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shl_Eq
				}
			}
		case '>':
			// '>', '>=', '>>', '>>='
			kind = .Gt
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Gt_Eq
			case '>':
				advance_rune(t)
				kind = .Shr
				if t.ch == '=' {
					advance_rune(t)
					kind = .Shr_Eq
				}
			}
		case '&':
			// '&', '&~', '&~=', '&=', '&&', '&&='
			kind = .And
			switch t.ch {
			case '~':
				advance_rune(t)
				kind = .And_Not
				if t.ch == '=' {
					advance_rune(t)
					kind = .And_Not_Eq
				}
			case '=':
				advance_rune(t)
				kind = .And_Eq
			case '&':
				advance_rune(t)
				kind = .Cmp_And
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_And_Eq
				}
			}
		case '|':
			// '|', '|=', '||', '||='
			kind = .Or
			switch t.ch {
			case '=':
				advance_rune(t)
				kind = .Or_Eq
			case '|':
				advance_rune(t)
				kind = .Cmp_Or
				if t.ch == '=' {
					advance_rune(t)
					kind = .Cmp_Or_Eq
				}
			}
		case:
			if ch != utf8.RUNE_BOM {
				error(t, t.offset, "illegal character '%r': %d", ch, ch)
			}
			kind = .Invalid
		}
	}

	// Decide whether the NEXT newline should become a semicolon: only
	// tokens that can legally end a statement set insert_semicolon.
	if .Insert_Semicolon in t.flags {
		#partial switch kind {
		case .Invalid, .Comment:
			// Preserve insert_semicolon info
		case .Ident, .Context, .Typeid, .Break, .Continue, .Fallthrough, .Return,
		     .Integer, .Float, .Imag, .Rune, .String, .Undef,
		     .Question, .Pointer, .Close_Paren, .Close_Bracket, .Close_Brace,
		     .Increment, .Decrement, .Or_Return, .Or_Break, .Or_Continue:
			/*fallthrough*/
			t.insert_semicolon = true
		case:
			t.insert_semicolon = false
			break
		}
	}

	// Operators/punctuation: the literal is the consumed source slice.
	if lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}