#define TOKEN_KINDS \ TOKEN_KIND(Token_Invalid, "Invalid"), \ TOKEN_KIND(Token_EOF, "EOF"), \ TOKEN_KIND(Token_Comment, "Comment"), \ \ TOKEN_KIND(Token__LiteralBegin, ""), \ TOKEN_KIND(Token_Ident, "identifier"), \ TOKEN_KIND(Token_Integer, "integer"), \ TOKEN_KIND(Token_Float, "float"), \ TOKEN_KIND(Token_Imag, "imaginary"), \ TOKEN_KIND(Token_Rune, "rune"), \ TOKEN_KIND(Token_String, "string"), \ TOKEN_KIND(Token__LiteralEnd, ""), \ \ TOKEN_KIND(Token__OperatorBegin, ""), \ TOKEN_KIND(Token_Eq, "="), \ TOKEN_KIND(Token_Not, "!"), \ TOKEN_KIND(Token_Hash, "#"), \ TOKEN_KIND(Token_At, "@"), \ TOKEN_KIND(Token_Dollar, "$"), \ TOKEN_KIND(Token_Pointer, "^"), \ TOKEN_KIND(Token_Question, "?"), \ TOKEN_KIND(Token_Add, "+"), \ TOKEN_KIND(Token_Sub, "-"), \ TOKEN_KIND(Token_Mul, "*"), \ TOKEN_KIND(Token_Quo, "/"), \ TOKEN_KIND(Token_Mod, "%"), \ TOKEN_KIND(Token_ModMod, "%%"), \ TOKEN_KIND(Token_And, "&"), \ TOKEN_KIND(Token_Or, "|"), \ TOKEN_KIND(Token_Xor, "~"), \ TOKEN_KIND(Token_AndNot, "&~"), \ TOKEN_KIND(Token_Shl, "<<"), \ TOKEN_KIND(Token_Shr, ">>"), \ TOKEN_KIND(Token_CmpAnd, "&&"), \ TOKEN_KIND(Token_CmpOr, "||"), \ \ TOKEN_KIND(Token__AssignOpBegin, ""), \ TOKEN_KIND(Token_AddEq, "+="), \ TOKEN_KIND(Token_SubEq, "-="), \ TOKEN_KIND(Token_MulEq, "*="), \ TOKEN_KIND(Token_QuoEq, "/="), \ TOKEN_KIND(Token_ModEq, "%="), \ TOKEN_KIND(Token_ModModEq, "%%="), \ TOKEN_KIND(Token_AndEq, "&="), \ TOKEN_KIND(Token_OrEq, "|="), \ TOKEN_KIND(Token_XorEq, "~="), \ TOKEN_KIND(Token_AndNotEq, "&~="), \ TOKEN_KIND(Token_ShlEq, "<<="), \ TOKEN_KIND(Token_ShrEq, ">>="), \ TOKEN_KIND(Token_CmpAndEq, "&&="), \ TOKEN_KIND(Token_CmpOrEq, "||="), \ TOKEN_KIND(Token__AssignOpEnd, ""), \ TOKEN_KIND(Token_Increment, "++"), \ TOKEN_KIND(Token_Decrement, "--"), \ TOKEN_KIND(Token_ArrowRight,"->"), \ TOKEN_KIND(Token_Undef, "---"), \ \ TOKEN_KIND(Token__ComparisonBegin, ""), \ TOKEN_KIND(Token_CmpEq, "=="), \ TOKEN_KIND(Token_NotEq, "!="), \ TOKEN_KIND(Token_Lt, "<"), \ TOKEN_KIND(Token_Gt, ">"), \ TOKEN_KIND(Token_LtEq, "<="), \ TOKEN_KIND(Token_GtEq, ">="), \ TOKEN_KIND(Token__ComparisonEnd, ""), \ \ TOKEN_KIND(Token_OpenParen, "("), \ TOKEN_KIND(Token_CloseParen, ")"), \ TOKEN_KIND(Token_OpenBracket, "["), \ TOKEN_KIND(Token_CloseBracket, "]"), \ TOKEN_KIND(Token_OpenBrace, "{"), \ TOKEN_KIND(Token_CloseBrace, "}"), \ TOKEN_KIND(Token_Colon, ":"), \ TOKEN_KIND(Token_Semicolon, ";"), \ TOKEN_KIND(Token_Period, "."), \ TOKEN_KIND(Token_Comma, ","), \ TOKEN_KIND(Token_Ellipsis, ".."), \ TOKEN_KIND(Token_RangeHalf, "..<"), \ TOKEN_KIND(Token_BackSlash, "\\"), \ TOKEN_KIND(Token__OperatorEnd, ""), \ \ TOKEN_KIND(Token__KeywordBegin, ""), \ TOKEN_KIND(Token_import, "import"), \ TOKEN_KIND(Token_foreign, "foreign"), \ TOKEN_KIND(Token_package, "package"), \ TOKEN_KIND(Token_typeid, "typeid"), \ TOKEN_KIND(Token_when, "when"), \ TOKEN_KIND(Token_where, "where"), \ TOKEN_KIND(Token_if, "if"), \ TOKEN_KIND(Token_else, "else"), \ TOKEN_KIND(Token_for, "for"), \ TOKEN_KIND(Token_switch, "switch"), \ TOKEN_KIND(Token_in, "in"), \ TOKEN_KIND(Token_not_in, "not_in"), \ TOKEN_KIND(Token_do, "do"), \ TOKEN_KIND(Token_case, "case"), \ TOKEN_KIND(Token_break, "break"), \ TOKEN_KIND(Token_continue, "continue"), \ TOKEN_KIND(Token_fallthrough, "fallthrough"), \ TOKEN_KIND(Token_defer, "defer"), \ TOKEN_KIND(Token_return, "return"), \ TOKEN_KIND(Token_proc, "proc"), \ TOKEN_KIND(Token_struct, "struct"), \ TOKEN_KIND(Token_union, "union"), \ TOKEN_KIND(Token_enum, "enum"), \ TOKEN_KIND(Token_bit_set, "bit_set"), \ TOKEN_KIND(Token_map, "map"), \ TOKEN_KIND(Token_dynamic, "dynamic"), \ TOKEN_KIND(Token_auto_cast, "auto_cast"), \ TOKEN_KIND(Token_cast, "cast"), \ TOKEN_KIND(Token_transmute, "transmute"), \ TOKEN_KIND(Token_distinct, "distinct"), \ TOKEN_KIND(Token_using, "using"), \ TOKEN_KIND(Token_inline, "inline"), \ TOKEN_KIND(Token_no_inline, "no_inline"), \ TOKEN_KIND(Token_context, "context"), \ TOKEN_KIND(Token_asm, "asm"), \ TOKEN_KIND(Token__KeywordEnd, ""), \ TOKEN_KIND(Token_Count, "") enum TokenKind { #define TOKEN_KIND(e, s) e TOKEN_KINDS #undef TOKEN_KIND }; String const token_strings[] = { #define TOKEN_KIND(e, s) {cast(u8 *)s, gb_size_of(s)-1} TOKEN_KINDS #undef TOKEN_KIND }; struct KeywordHashEntry { u32 hash; TokenKind kind; String text; }; enum { KEYWORD_HASH_TABLE_COUNT = 1<<9, KEYWORD_HASH_TABLE_MASK = KEYWORD_HASH_TABLE_COUNT-1, }; gb_global KeywordHashEntry keyword_hash_table[KEYWORD_HASH_TABLE_COUNT] = {}; GB_STATIC_ASSERT(Token__KeywordEnd-Token__KeywordBegin <= gb_count_of(keyword_hash_table)); gb_global isize const min_keyword_size = 2; gb_global isize max_keyword_size = 11; gb_global bool keyword_indices[16] = {}; gb_inline u32 keyword_hash(u8 const *text, isize len) { return fnv32a(text, len); // return murmur3_32(text, len, 0x6f64696e); } void add_keyword_hash_entry(String const &s, TokenKind kind) { max_keyword_size = gb_max(max_keyword_size, s.len); keyword_indices[s.len] = true; u32 hash = keyword_hash(s.text, s.len); // NOTE(bill): This is a bit of an empirical hack in order to speed things up u32 index = hash & KEYWORD_HASH_TABLE_MASK; KeywordHashEntry *entry = &keyword_hash_table[index]; GB_ASSERT_MSG(entry->kind == Token_Invalid, "Keyword hash table initialtion collision: %.*s %.*s %08x %08x", LIT(s), LIT(token_strings[entry->kind]), hash, entry->hash); entry->hash = hash; entry->kind = kind; entry->text = s; } void init_keyword_hash_table(void) { for (i32 kind = Token__KeywordBegin+1; kind < Token__KeywordEnd; kind++) { add_keyword_hash_entry(token_strings[kind], cast(TokenKind)kind); } static struct { String s; TokenKind kind; } const legacy_keywords[] = { {str_lit("notin"), Token_not_in}, }; for (i32 i = 0; i < gb_count_of(legacy_keywords); i++) { add_keyword_hash_entry(legacy_keywords[i].s, legacy_keywords[i].kind); } GB_ASSERT(max_keyword_size < 16); } gb_global Array global_file_path_strings; // index is file id String get_file_path_string(i32 index); struct TokenPos { i32 file_id; i32 offset; // starting at 0 i32 line; // starting at 1 i32 column; // starting at 1 }; // temporary char *token_pos_to_string(TokenPos const &pos) { gbString s = gb_string_make_reserve(temporary_allocator(), 128); String file = get_file_path_string(pos.file_id); s = gb_string_append_fmt(s, "%.*s(%d:%d)", LIT(file), pos.line, pos.column); return s; } i32 token_pos_cmp(TokenPos const &a, TokenPos const &b) { if (a.offset != b.offset) { return (a.offset < b.offset) ? -1 : +1; } if (a.line != b.line) { return (a.line < b.line) ? -1 : +1; } if (a.column != b.column) { return (a.column < b.column) ? -1 : +1; } return string_compare(get_file_path_string(a.file_id), get_file_path_string(b.file_id)); } bool operator==(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) == 0; } bool operator!=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) != 0; } bool operator< (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) < 0; } bool operator<=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) <= 0; } bool operator> (TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) > 0; } bool operator>=(TokenPos const &a, TokenPos const &b) { return token_pos_cmp(a, b) >= 0; } TokenPos token_pos_add_column(TokenPos pos) { pos.column += 1; pos.offset += 1; return pos; } struct Token { TokenKind kind; String string; TokenPos pos; }; Token empty_token = {Token_Invalid}; Token blank_token = {Token_Ident, {cast(u8 *)"_", 1}}; Token make_token_ident(String s) { Token t = {Token_Ident, s}; return t; } Token make_token_ident(char const *s) { Token t = {Token_Ident, make_string_c(s)}; return t; } bool token_is_newline(Token const &tok) { return tok.kind == Token_Semicolon && tok.string == "\n"; } struct ErrorCollector { TokenPos prev; i64 count; i64 warning_count; bool in_block; gbMutex mutex; gbMutex string_mutex; Array error_buffer; Array errors; }; gb_global ErrorCollector global_error_collector; #define MAX_ERROR_COLLECTOR_COUNT (36) bool any_errors(void) { return global_error_collector.error_buffer.count > 0; } void init_global_error_collector(void) { gb_mutex_init(&global_error_collector.mutex); gb_mutex_init(&global_error_collector.string_mutex); array_init(&global_error_collector.errors, heap_allocator()); array_init(&global_error_collector.error_buffer, heap_allocator()); array_init(&global_file_path_strings, heap_allocator(), 4096); } bool set_file_path_string(i32 index, String const &path) { bool ok = false; GB_ASSERT(index >= 0); gb_mutex_lock(&global_error_collector.string_mutex); if (index >= global_file_path_strings.count) { array_resize(&global_file_path_strings, index); } String prev = global_file_path_strings[index]; if (prev.len == 0) { global_file_path_strings[index] = path; ok = true; } gb_mutex_unlock(&global_error_collector.string_mutex); return ok; } String get_file_path_string(i32 index) { GB_ASSERT(index >= 0); gb_mutex_lock(&global_error_collector.string_mutex); String path = {}; if (index < global_file_path_strings.count) { path = global_file_path_strings[index]; } gb_mutex_unlock(&global_error_collector.string_mutex); return path; } void begin_error_block(void) { gb_mutex_lock(&global_error_collector.mutex); global_error_collector.in_block = true; } void end_error_block(void) { if (global_error_collector.error_buffer.count > 0) { isize n = global_error_collector.error_buffer.count; u8 *text = gb_alloc_array(heap_allocator(), u8, n+1); gb_memmove(text, global_error_collector.error_buffer.data, n); text[n] = 0; String s = {text, n}; array_add(&global_error_collector.errors, s); global_error_collector.error_buffer.count = 0; // gbFile *f = gb_file_get_standard(gbFileStandard_Error); // gb_file_write(f, text, n); } global_error_collector.in_block = false; gb_mutex_unlock(&global_error_collector.mutex); } #define ERROR_OUT_PROC(name) void name(char const *fmt, va_list va) typedef ERROR_OUT_PROC(ErrorOutProc); ERROR_OUT_PROC(default_error_out_va) { gbFile *f = gb_file_get_standard(gbFileStandard_Error); char buf[4096] = {}; isize len = gb_snprintf_va(buf, gb_size_of(buf), fmt, va); isize n = len-1; if (global_error_collector.in_block) { isize cap = global_error_collector.error_buffer.count + n; array_reserve(&global_error_collector.error_buffer, cap); u8 *data = global_error_collector.error_buffer.data + global_error_collector.error_buffer.count; gb_memmove(data, buf, n); global_error_collector.error_buffer.count += n; } else { gb_mutex_lock(&global_error_collector.mutex); { u8 *text = gb_alloc_array(heap_allocator(), u8, n+1); gb_memmove(text, buf, n); text[n] = 0; array_add(&global_error_collector.errors, make_string(text, n)); } gb_mutex_unlock(&global_error_collector.mutex); } gb_file_write(f, buf, n); } ErrorOutProc *error_out_va = default_error_out_va; // NOTE: defined in build_settings.cpp bool global_warnings_as_errors(void); bool global_ignore_warnings(void); void error_out(char const *fmt, ...) { va_list va; va_start(va, fmt); error_out_va(fmt, va); va_end(va); } void error_va(Token token, char const *fmt, va_list va) { gb_mutex_lock(&global_error_collector.mutex); global_error_collector.count++; // NOTE(bill): Duplicate error, skip it if (token.pos.line == 0) { error_out("Error: %s\n", gb_bprintf_va(fmt, va)); } else if (global_error_collector.prev != token.pos) { global_error_collector.prev = token.pos; error_out("%s %s\n", token_pos_to_string(token.pos), gb_bprintf_va(fmt, va)); } gb_mutex_unlock(&global_error_collector.mutex); if (global_error_collector.count > MAX_ERROR_COLLECTOR_COUNT) { gb_exit(1); } } void warning_va(Token token, char const *fmt, va_list va) { if (global_warnings_as_errors()) { error_va(token, fmt, va); return; } gb_mutex_lock(&global_error_collector.mutex); global_error_collector.warning_count++; if (!global_ignore_warnings()) { // NOTE(bill): Duplicate error, skip it if (token.pos.line == 0) { error_out("Warning: %s\n", gb_bprintf_va(fmt, va)); } else if (global_error_collector.prev != token.pos) { global_error_collector.prev = token.pos; error_out("%s Warning: %s\n", token_pos_to_string(token.pos), gb_bprintf_va(fmt, va)); } } gb_mutex_unlock(&global_error_collector.mutex); } void error_line_va(char const *fmt, va_list va) { gb_mutex_lock(&global_error_collector.mutex); error_out_va(fmt, va); gb_mutex_unlock(&global_error_collector.mutex); } void error_no_newline_va(Token token, char const *fmt, va_list va) { gb_mutex_lock(&global_error_collector.mutex); global_error_collector.count++; // NOTE(bill): Duplicate error, skip it if (token.pos.line == 0) { error_out("Error: %s", gb_bprintf_va(fmt, va)); } else if (global_error_collector.prev != token.pos) { global_error_collector.prev = token.pos; error_out("%s %s", token_pos_to_string(token.pos), gb_bprintf_va(fmt, va)); } gb_mutex_unlock(&global_error_collector.mutex); if (global_error_collector.count > MAX_ERROR_COLLECTOR_COUNT) { gb_exit(1); } } void syntax_error_va(Token token, char const *fmt, va_list va) { gb_mutex_lock(&global_error_collector.mutex); global_error_collector.count++; // NOTE(bill): Duplicate error, skip it if (global_error_collector.prev != token.pos) { global_error_collector.prev = token.pos; error_out("%s Syntax Error: %s\n", token_pos_to_string(token.pos), gb_bprintf_va(fmt, va)); } else if (token.pos.line == 0) { error_out("Syntax Error: %s\n", gb_bprintf_va(fmt, va)); } gb_mutex_unlock(&global_error_collector.mutex); if (global_error_collector.count > MAX_ERROR_COLLECTOR_COUNT) { gb_exit(1); } } void syntax_warning_va(Token token, char const *fmt, va_list va) { if (global_warnings_as_errors()) { syntax_error_va(token, fmt, va); return; } gb_mutex_lock(&global_error_collector.mutex); global_error_collector.warning_count++; if (!global_ignore_warnings()) { // NOTE(bill): Duplicate error, skip it if (global_error_collector.prev != token.pos) { global_error_collector.prev = token.pos; error_out("%s Syntax Warning: %s\n", token_pos_to_string(token.pos), gb_bprintf_va(fmt, va)); } else if (token.pos.line == 0) { error_out("Warning: %s\n", gb_bprintf_va(fmt, va)); } } gb_mutex_unlock(&global_error_collector.mutex); } void warning(Token token, char const *fmt, ...) { va_list va; va_start(va, fmt); warning_va(token, fmt, va); va_end(va); } void error(Token token, char const *fmt, ...) { va_list va; va_start(va, fmt); error_va(token, fmt, va); va_end(va); } void error(TokenPos pos, char const *fmt, ...) { va_list va; va_start(va, fmt); Token token = {}; token.pos = pos; error_va(token, fmt, va); va_end(va); } void error_line(char const *fmt, ...) { va_list va; va_start(va, fmt); error_line_va(fmt, va); va_end(va); } void syntax_error(Token token, char const *fmt, ...) { va_list va; va_start(va, fmt); syntax_error_va(token, fmt, va); va_end(va); } void syntax_error(TokenPos pos, char const *fmt, ...) { va_list va; va_start(va, fmt); Token token = {}; token.pos = pos; syntax_error_va(token, fmt, va); va_end(va); } void syntax_warning(Token token, char const *fmt, ...) { va_list va; va_start(va, fmt); syntax_warning_va(token, fmt, va); va_end(va); } void compiler_error(char const *fmt, ...) { va_list va; va_start(va, fmt); gb_printf_err("Internal Compiler Error: %s\n", gb_bprintf_va(fmt, va)); va_end(va); gb_exit(1); } gb_inline bool token_is_literal(TokenKind t) { return gb_is_between(t, Token__LiteralBegin+1, Token__LiteralEnd-1); } gb_inline bool token_is_operator(TokenKind t) { return gb_is_between(t, Token__OperatorBegin+1, Token__OperatorEnd-1); } gb_inline bool token_is_keyword(TokenKind t) { return gb_is_between(t, Token__KeywordBegin+1, Token__KeywordEnd-1); } gb_inline bool token_is_comparison(TokenKind t) { return gb_is_between(t, Token__ComparisonBegin+1, Token__ComparisonEnd-1); } gb_inline bool token_is_shift(TokenKind t) { return t == Token_Shl || t == Token_Shr; } gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); } enum TokenizerInitError { TokenizerInit_None, TokenizerInit_Invalid, TokenizerInit_NotExists, TokenizerInit_Permission, TokenizerInit_Empty, TokenizerInit_FileTooLarge, TokenizerInit_Count, }; struct TokenizerState { Rune curr_rune; // current character u8 * curr; // character pos u8 * read_curr; // pos from start u8 * line; // current line pos i32 line_count; bool insert_semicolon; }; enum TokenizerFlags { TokenizerFlag_None = 0, TokenizerFlag_InsertSemicolon = 1<<0, }; struct Tokenizer { i32 curr_file_id; String fullpath; u8 *start; u8 *end; Rune curr_rune; // current character u8 * curr; // character pos u8 * read_curr; // pos from start u8 * line; // current line pos i32 line_count; i32 error_count; Array allocated_strings; TokenizerFlags flags; bool insert_semicolon; }; TokenizerState save_tokenizer_state(Tokenizer *t) { TokenizerState state = {}; state.curr_rune = t->curr_rune; state.curr = t->curr; state.read_curr = t->read_curr; state.line = t->line; state.line_count = t->line_count; state.insert_semicolon = t->insert_semicolon; return state; } void restore_tokenizer_state(Tokenizer *t, TokenizerState *state) { t->curr_rune = state->curr_rune; t->curr = state->curr; t->read_curr = state->read_curr; t->line = state->line; t->line_count = state->line_count; t->insert_semicolon = state->insert_semicolon; } void tokenizer_err(Tokenizer *t, char const *msg, ...) { va_list va; isize column = t->read_curr - t->line+1; if (column < 1) { column = 1; } Token token = {}; token.pos.file_id = t->curr_file_id; token.pos.line = t->line_count; token.pos.column = cast(i32)column; va_start(va, msg); syntax_error_va(token, msg, va); va_end(va); t->error_count++; } void tokenizer_err(Tokenizer *t, TokenPos const &pos, char const *msg, ...) { va_list va; isize column = t->read_curr - t->line+1; if (column < 1) { column = 1; } Token token = {}; token.pos = pos; va_start(va, msg); syntax_error_va(token, msg, va); va_end(va); t->error_count++; } void advance_to_next_rune(Tokenizer *t) { if (t->read_curr < t->end) { Rune rune; isize width = 1; t->curr = t->read_curr; if (t->curr_rune == '\n') { t->line = t->curr; t->line_count++; } rune = *t->read_curr; if (rune == 0) { tokenizer_err(t, "Illegal character NUL"); } else if (rune >= 0x80) { // not ASCII width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune); if (rune == GB_RUNE_INVALID && width == 1) { tokenizer_err(t, "Illegal UTF-8 encoding"); } else if (rune == GB_RUNE_BOM && t->curr-t->start > 0){ tokenizer_err(t, "Illegal byte order mark"); } } t->read_curr += width; t->curr_rune = rune; } else { t->curr = t->end; if (t->curr_rune == '\n') { t->line = t->curr; t->line_count++; } t->curr_rune = GB_RUNE_EOF; } } TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath, TokenizerFlags flags = TokenizerFlag_None) { TokenizerInitError err = TokenizerInit_None; char *c_str = alloc_cstring(heap_allocator(), fullpath); defer (gb_free(heap_allocator(), c_str)); // TODO(bill): Memory map rather than copy contents gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str); t->flags = flags; t->fullpath = fullpath; t->line_count = 1; if (fc.size > I32_MAX) { err = TokenizerInit_FileTooLarge; gb_file_free_contents(&fc); } else if (fc.data != nullptr) { t->start = cast(u8 *)fc.data; t->line = t->read_curr = t->curr = t->start; t->end = t->start + fc.size; advance_to_next_rune(t); if (t->curr_rune == GB_RUNE_BOM) { advance_to_next_rune(t); // Ignore BOM at file beginning } array_init(&t->allocated_strings, heap_allocator()); } else { gbFile f = {}; gbFileError file_err = gb_file_open(&f, c_str); defer (gb_file_close(&f)); switch (file_err) { case gbFileError_Invalid: err = TokenizerInit_Invalid; break; case gbFileError_NotExists: err = TokenizerInit_NotExists; break; case gbFileError_Permission: err = TokenizerInit_Permission; break; } if (err == TokenizerInit_None && gb_file_size(&f) == 0) { err = TokenizerInit_Empty; } } return err; } gb_inline void destroy_tokenizer(Tokenizer *t) { if (t->start != nullptr) { gb_free(heap_allocator(), t->start); } for_array(i, t->allocated_strings) { gb_free(heap_allocator(), t->allocated_strings[i].text); } array_free(&t->allocated_strings); } gb_inline i32 digit_value(Rune r) { switch (r) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return r - '0'; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return r - 'a' + 10; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': return r - 'A' + 10; } return 16; // NOTE(bill): Larger than highest possible } gb_inline void scan_mantissa(Tokenizer *t, i32 base) { while (digit_value(t->curr_rune) < base || t->curr_rune == '_') { advance_to_next_rune(t); } } u8 peek_byte(Tokenizer *t, isize offset=0) { if (t->read_curr+offset < t->end) { return t->read_curr[offset]; } return 0; } void scan_number_to_token(Tokenizer *t, Token *token, bool seen_decimal_point) { token->kind = Token_Integer; token->string = {t->curr, 1}; token->pos.file_id = t->curr_file_id; token->pos.line = t->line_count; token->pos.column = cast(i32)(t->curr-t->line+1); if (seen_decimal_point) { token->string.text -= 1; token->string.len += 1; token->pos.column -= 1; token->kind = Token_Float; scan_mantissa(t, 10); goto exponent; } if (t->curr_rune == '0') { u8 *prev = t->curr; advance_to_next_rune(t); switch (t->curr_rune) { case 'b': // Binary advance_to_next_rune(t); scan_mantissa(t, 2); if (t->curr - prev <= 2) { token->kind = Token_Invalid; } goto end; case 'o': // Octal advance_to_next_rune(t); scan_mantissa(t, 8); if (t->curr - prev <= 2) { token->kind = Token_Invalid; } goto end; case 'd': // Decimal advance_to_next_rune(t); scan_mantissa(t, 10); if (t->curr - prev <= 2) { token->kind = Token_Invalid; } goto end; case 'z': // Dozenal advance_to_next_rune(t); scan_mantissa(t, 12); if (t->curr - prev <= 2) { token->kind = Token_Invalid; } goto end; case 'x': // Hexadecimal advance_to_next_rune(t); scan_mantissa(t, 16); if (t->curr - prev <= 2) { token->kind = Token_Invalid; } goto end; case 'h': // Hexadecimal Float token->kind = Token_Float; advance_to_next_rune(t); scan_mantissa(t, 16); if (t->curr - prev <= 2) { token->kind = Token_Invalid; } else { u8 *start = prev+2; isize n = t->curr - start; isize digit_count = 0; for (isize i = 0; i < n; i++) { if (start[i] != '_') { digit_count += 1; } } switch (digit_count) { case 4: case 8: case 16: break; default: tokenizer_err(t, "Invalid hexadecimal float, expected 4, 8, or 16 digits, got %td", digit_count); break; } } goto end; default: scan_mantissa(t, 10); goto fraction; } } scan_mantissa(t, 10); fraction: if (t->curr_rune == '.') { if (peek_byte(t) == '.') { // NOTE(bill): this is kind of ellipsis goto end; } advance_to_next_rune(t); token->kind = Token_Float; scan_mantissa(t, 10); } exponent: if (t->curr_rune == 'e' || t->curr_rune == 'E') { token->kind = Token_Float; advance_to_next_rune(t); if (t->curr_rune == '-' || t->curr_rune == '+') { advance_to_next_rune(t); } scan_mantissa(t, 10); } switch (t->curr_rune) { case 'i': case 'j': case 'k': token->kind = Token_Imag; advance_to_next_rune(t); break; } end: token->string.len = t->curr - token->string.text; return; } bool scan_escape(Tokenizer *t) { isize len = 0; u32 base = 0, max = 0, x = 0; Rune r = t->curr_rune; switch (r) { case 'a': case 'b': case 'e': case 'f': case 'n': case 'r': case 't': case 'v': case '\\': case '\'': case '\"': advance_to_next_rune(t); return true; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': len = 3; base = 8; max = 255; break; case 'x': advance_to_next_rune(t); len = 2; base = 16; max = 255; break; case 'u': advance_to_next_rune(t); len = 4; base = 16; max = GB_RUNE_MAX; break; case 'U': advance_to_next_rune(t); len = 8; base = 16; max = GB_RUNE_MAX; break; default: if (t->curr_rune < 0) { tokenizer_err(t, "Escape sequence was not terminated"); } else { tokenizer_err(t, "Unknown escape sequence"); } return false; } while (len --> 0) { u32 d = cast(u32)digit_value(t->curr_rune); if (d >= base) { if (t->curr_rune < 0) { tokenizer_err(t, "Escape sequence was not terminated"); } else { tokenizer_err(t, "Illegal character %d in escape sequence", t->curr_rune); } return false; } x = x*base + d; advance_to_next_rune(t); } return true; } void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { // Skip whitespace for (;;) { switch (t->curr_rune) { case '\n': if (t->insert_semicolon) { break; } /*fallthrough*/ case ' ': case '\t': case '\r': advance_to_next_rune(t); continue; } break; } token->kind = Token_Invalid; token->string.text = t->curr; token->string.len = 1; token->pos.file_id = t->curr_file_id; token->pos.line = t->line_count; token->pos.offset = cast(i32)(t->curr - t->start); token->pos.column = cast(i32)(t->curr - t->line + 1); TokenPos current_pos = token->pos; bool insert_semicolon = false; Rune curr_rune = t->curr_rune; if (rune_is_letter(curr_rune)) { token->kind = Token_Ident; while (rune_is_letter_or_digit(t->curr_rune)) { advance_to_next_rune(t); } token->string.len = t->curr - token->string.text; // NOTE(bill): Heavily optimize to make it faster to find keywords if (1 < token->string.len && token->string.len <= max_keyword_size && keyword_indices[token->string.len]) { u32 hash = keyword_hash(token->string.text, token->string.len); u32 index = hash & KEYWORD_HASH_TABLE_MASK; KeywordHashEntry *entry = &keyword_hash_table[index]; if (entry->kind != Token_Invalid && entry->hash == hash) { if (str_eq(entry->text, token->string)) { token->kind = entry->kind; if (token->kind == Token_not_in && entry->text == "notin") { syntax_warning(*token, "'notin' is deprecated in favour of 'not_in'"); } } } } switch (token->kind) { case Token_Ident: case Token_context: case Token_typeid: // Dunno? case Token_break: case Token_continue: case Token_fallthrough: case Token_return: insert_semicolon = true; break; } if (t->flags & TokenizerFlag_InsertSemicolon) { t->insert_semicolon = insert_semicolon; } return; } else if (gb_is_between(curr_rune, '0', '9')) { insert_semicolon = true; scan_number_to_token(t, token, false); } else { advance_to_next_rune(t); switch (curr_rune) { case GB_RUNE_EOF: token->kind = Token_EOF; if (t->insert_semicolon) { t->insert_semicolon = false; // EOF consumed token->string = str_lit("\n"); token->kind = Token_Semicolon; return; } break; case '\n': t->insert_semicolon = false; token->string = str_lit("\n"); token->kind = Token_Semicolon; return; case '\\': if (t->flags & TokenizerFlag_InsertSemicolon) { t->insert_semicolon = false; } tokenizer_get_token(t, token); if (token->pos.line == current_pos.line) { tokenizer_err(t, token_pos_add_column(current_pos), "Expected a newline after \\"); } // NOTE(bill): tokenizer_get_token has been called already, return early return; case '\'': // Rune Literal { insert_semicolon = true; token->kind = Token_Rune; Rune quote = curr_rune; bool valid = true; i32 n = 0, success; for (;;) { Rune r = t->curr_rune; if (r == '\n' || r < 0) { tokenizer_err(t, "Rune literal not terminated"); break; } advance_to_next_rune(t); if (r == quote) { break; } n++; if (r == '\\') { if (!scan_escape(t)) { valid = false; } } } // TODO(bill): Better Error Handling if (valid && n != 1) { tokenizer_err(t, "Invalid rune literal"); } token->string.len = t->curr - token->string.text; success = unquote_string(heap_allocator(), &token->string, 0); if (success > 0) { if (success == 2) { array_add(&t->allocated_strings, token->string); } } else { tokenizer_err(t, "Invalid rune literal"); } if (t->flags & TokenizerFlag_InsertSemicolon) { t->insert_semicolon = insert_semicolon; } return; } break; case '`': // Raw String Literal case '"': // String Literal { insert_semicolon = true; bool has_carriage_return = false; i32 success; Rune quote = curr_rune; token->kind = Token_String; if (curr_rune == '"') { for (;;) { Rune r = t->curr_rune; if (r == '\n' || r < 0) { tokenizer_err(t, "String literal not terminated"); break; } advance_to_next_rune(t); if (r == quote) { break; } if (r == '\\') { scan_escape(t); } } } else { for (;;) { Rune r = t->curr_rune; if (r < 0) { tokenizer_err(t, "String literal not terminated"); break; } advance_to_next_rune(t); if (r == quote) { break; } if (r == '\r') { has_carriage_return = true; } } } token->string.len = t->curr - token->string.text; success = unquote_string(heap_allocator(), &token->string, 0, has_carriage_return); if (success > 0) { if (success == 2) { array_add(&t->allocated_strings, token->string); } } else { tokenizer_err(t, "Invalid string literal"); } if (t->flags & TokenizerFlag_InsertSemicolon) { t->insert_semicolon = insert_semicolon; } return; } break; case '.': if (t->curr_rune == '.') { advance_to_next_rune(t); token->kind = Token_Ellipsis; if (t->curr_rune == '<') { advance_to_next_rune(t); token->kind = Token_RangeHalf; } } else if ('0' <= t->curr_rune && t->curr_rune <= '9') { scan_number_to_token(t, token, true); } else { token->kind = Token_Period; } break; case '@': token->kind = Token_At; break; case '$': token->kind = Token_Dollar; break; case '?': insert_semicolon = true; token->kind = Token_Question; break; case '^': insert_semicolon = true; token->kind = Token_Pointer; break; case ';': token->kind = Token_Semicolon; break; case ',': token->kind = Token_Comma; break; case ':': token->kind = Token_Colon; break; case '(': token->kind = Token_OpenParen; break; case ')': insert_semicolon = true; token->kind = Token_CloseParen; break; case '[': token->kind = Token_OpenBracket; break; case ']': insert_semicolon = true; token->kind = Token_CloseBracket; break; case '{': token->kind = Token_OpenBrace; break; case '}': insert_semicolon = true; token->kind = Token_CloseBrace; break; case '%': token->kind = Token_Mod; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_ModEq; } else if (t->curr_rune == '%') { token->kind = Token_ModMod; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_ModModEq; advance_to_next_rune(t); } } break; case '*': token->kind = Token_Mul; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_MulEq; } break; case '=': token->kind = Token_Eq; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_CmpEq; } break; case '~': token->kind = Token_Xor; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_XorEq; } break; case '!': token->kind = Token_Not; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_NotEq; } break; case '+': token->kind = Token_Add; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_AddEq; } else if (t->curr_rune == '+') { advance_to_next_rune(t); token->kind = Token_Increment; insert_semicolon = true; } break; case '-': token->kind = Token_Sub; if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_SubEq; } else if (t->curr_rune == '-' && peek_byte(t) == '-') { advance_to_next_rune(t); advance_to_next_rune(t); token->kind = Token_Undef; } else if (t->curr_rune == '-') { advance_to_next_rune(t); token->kind = Token_Decrement; insert_semicolon = true; }else if (t->curr_rune == '>') { advance_to_next_rune(t); token->kind = Token_ArrowRight; } break; case '#': if (t->curr_rune == '!') { insert_semicolon = t->insert_semicolon; token->kind = Token_Comment; while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) { advance_to_next_rune(t); } } else { token->kind = Token_Hash; } break; case '/': { token->kind = Token_Quo; if (t->curr_rune == '/') { insert_semicolon = t->insert_semicolon; token->kind = Token_Comment; while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) { advance_to_next_rune(t); } } else if (t->curr_rune == '*') { token->kind = Token_Comment; isize comment_scope = 1; advance_to_next_rune(t); while (comment_scope > 0) { if (t->curr_rune == GB_RUNE_EOF) { break; } else if (t->curr_rune == '/') { advance_to_next_rune(t); if (t->curr_rune == '*') { advance_to_next_rune(t); comment_scope++; } } else if (t->curr_rune == '*') { advance_to_next_rune(t); if (t->curr_rune == '/') { advance_to_next_rune(t); comment_scope--; } } else { advance_to_next_rune(t); } } } else if (t->curr_rune == '=') { advance_to_next_rune(t); token->kind = Token_QuoEq; } } break; case '<': token->kind = Token_Lt; if (t->curr_rune == '=') { token->kind = Token_LtEq; advance_to_next_rune(t); } else if (t->curr_rune == '<') { token->kind = Token_Shl; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_ShlEq; advance_to_next_rune(t); } } break; case '>': token->kind = Token_Gt; if (t->curr_rune == '=') { token->kind = Token_GtEq; advance_to_next_rune(t); } else if (t->curr_rune == '>') { token->kind = Token_Shr; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_ShrEq; advance_to_next_rune(t); } } break; case '&': token->kind = Token_And; if (t->curr_rune == '~') { token->kind = Token_AndNot; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_AndNotEq; advance_to_next_rune(t); } } else if (t->curr_rune == '=') { token->kind = Token_AndEq; advance_to_next_rune(t); } else if (t->curr_rune == '&') { token->kind = Token_CmpAnd; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_CmpAndEq; advance_to_next_rune(t); } } break; case '|': token->kind = Token_Or; if (t->curr_rune == '=') { token->kind = Token_OrEq; advance_to_next_rune(t); } else if (t->curr_rune == '|') { token->kind = Token_CmpOr; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_CmpOrEq; advance_to_next_rune(t); } } break; default: if (curr_rune != GB_RUNE_BOM) { u8 str[4] = {}; int len = cast(int)gb_utf8_encode_rune(str, curr_rune); tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune); } insert_semicolon = t->insert_semicolon; // Preserve insert_semicolon info token->kind = Token_Invalid; break; } } if (t->flags & TokenizerFlag_InsertSemicolon) { t->insert_semicolon = insert_semicolon; } token->string.len = t->curr - token->string.text; return; }