4 years ago · be76da2c90
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -3512,7 +3512,7 @@ bool is_string_an_identifier(String s) {
 
				 	while (offset < s.len) {
			
 
				 		bool ok = false;
			
 
				 		Rune r = -1;
			
 
				-		isize size = gb_utf8_decode(s.text+offset, s.len-offset, &r);
			
 
				+		isize size = utf8_decode(s.text+offset, s.len-offset, &r);
			
 
				 		if (offset == 0) {
			
 
				 			ok = rune_is_letter(r);
			
 
				 		} else {
			
--- a/src/exact_value.cpp
+++ b/src/exact_value.cpp
@@ -317,7 +317,7 @@ ExactValue exact_value_from_basic_literal(Token token) {
 
				 	}
			
 
				 	case Token_Rune: {
			
 
				 		Rune r = GB_RUNE_INVALID;
			
 
				-		gb_utf8_decode(token.string.text, token.string.len, &r);
			
 
				+		utf8_decode(token.string.text, token.string.len, &r);
			
 
				 		return exact_value_i64(r);
			
 
				 	}
			
 
				 	default:
			
--- a/src/gb/gb.h
+++ b/src/gb/gb.h
@@ -8232,7 +8232,9 @@ gbFileContents gb_file_read_contents(gbAllocator a, b32 zero_terminate, char con
 
				 	if (gb_file_open(&file, filepath) == gbFileError_None) {
			
 
				 		isize file_size = cast(isize)gb_file_size(&file);
			
 
				 		if (file_size > 0) {
			
 
				-			result.data = gb_alloc(a, zero_terminate ? file_size+1 : file_size);
			
 
				+			isize total_size = file_size + !!zero_terminate;
			
 
				+			total_size = (total_size+7)&~7;
			
 
				+			result.data = gb_alloc(a, total_size);
			
 
				 			result.size = file_size;
			
 
				 			gb_file_read_at(&file, result.data, result.size, 0);
			
 
				 			if (zero_terminate) {
			
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -4712,7 +4712,7 @@ void lb_build_unroll_range_stmt(lbProcedure *p, AstUnrollRangeStmt *rs, Scope *s
 
				 				Rune codepoint = 0;
			
 
				 				isize offset = 0;
			
 
				 				do {
			
 
				-					isize width = gb_utf8_decode(str.text+offset, str.len-offset, &codepoint);
			
 
				+					isize width = utf8_decode(str.text+offset, str.len-offset, &codepoint);
			
 
				 					if (val0_type) lb_addr_store(p, val0_addr, lb_const_value(m, val0_type, exact_value_i64(codepoint)));
			
 
				 					if (val1_type) lb_addr_store(p, val1_addr, lb_const_value(m, val1_type, exact_value_i64(offset)));
			
 
				 					lb_build_stmt(p, rs->body);
			
@@ -6563,7 +6563,7 @@ lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, bool allow_loc
 
				 			LLVMValueRef *elems = gb_alloc_array(permanent_allocator(), LLVMValueRef, cast(isize)count);
			
 
				 
			
 
				 			for (i64 i = 0; i < count && offset < s.len; i++) {
			
 
				-				width = gb_utf8_decode(s.text+offset, s.len-offset, &rune);
			
 
				+				width = utf8_decode(s.text+offset, s.len-offset, &rune);
			
 
				 				offset += width;
			
 
				 
			
 
				 				elems[i] = LLVMConstInt(et, rune, true);
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -561,7 +561,7 @@ bool string_is_valid_identifier(String str) {
 
				 	isize offset = 0;
			
 
				 	while (offset < str.len) {
			
 
				 		Rune r = 0;
			
 
				-		w = gb_utf8_decode(str.text, str.len, &r);
			
 
				+		w = utf8_decode(str.text, str.len, &r);
			
 
				 		if (r == GB_RUNE_INVALID) {
			
 
				 			return false;
			
 
				 		}
			
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -1705,7 +1705,7 @@ bool is_foreign_name_valid(String name) {
 
				 	while (offset < name.len) {
			
 
				 		Rune rune;
			
 
				 		isize remaining = name.len - offset;
			
 
				-		isize width = gb_utf8_decode(name.text+offset, remaining, &rune);
			
 
				+		isize width = utf8_decode(name.text+offset, remaining, &rune);
			
 
				 		if (rune == GB_RUNE_INVALID && width == 1) {
			
 
				 			return false;
			
 
				 		} else if (rune == GB_RUNE_BOM && remaining > 0) {
			
@@ -4612,7 +4612,7 @@ ParseFileError init_ast_file(AstFile *f, String fullpath, TokenPos *err_pos) {
 
				 
			
 
				 	u64 start = time_stamp_time_now();
			
 
				 
			
 
				-	while (f->curr_token.kind != Token_EOF) {
			
 
				+	for (;;) {
			
 
				 		Token *token = array_add_and_get(&f->tokens);
			
 
				 		tokenizer_get_token(&f->tokenizer, token);
			
 
				 		if (token->kind == Token_Invalid) {
			
@@ -4887,7 +4887,7 @@ bool is_import_path_valid(String path) {
 
				 			isize width = 1;
			
 
				 			Rune r = *curr;
			
 
				 			if (r >= 0x80) {
			
 
				-				width = gb_utf8_decode(curr, end-curr, &r);
			
 
				+				width = utf8_decode(curr, end-curr, &r);
			
 
				 				if (r == GB_RUNE_INVALID && width == 1) {
			
 
				 					return false;
			
 
				 				}
			
@@ -4920,7 +4920,7 @@ bool is_build_flag_path_valid(String path) {
 
				 			isize width = 1;
			
 
				 			Rune r = *curr;
			
 
				 			if (r >= 0x80) {
			
 
				-				width = gb_utf8_decode(curr, end-curr, &r);
			
 
				+				width = utf8_decode(curr, end-curr, &r);
			
 
				 				if (r == GB_RUNE_INVALID && width == 1) {
			
 
				 					return false;
			
 
				 				}
			
@@ -5170,7 +5170,7 @@ String build_tag_get_token(String s, String *out) {
 
				 	isize n = 0;
			
 
				 	while (n < s.len) {
			
 
				 		Rune rune = 0;
			
 
				-		isize width = gb_utf8_decode(&s[n], s.len-n, &rune);
			
 
				+		isize width = utf8_decode(&s[n], s.len-n, &rune);
			
 
				 		if (n == 0 && rune == '!') {
			
 
				 
			
 
				 		} else if (!rune_is_letter(rune) && !rune_is_digit(rune)) {
			
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -500,7 +500,7 @@ String quote_to_ascii(gbAllocator a, String str, u8 quote='"') {
 
				 		Rune r = cast(Rune)s[0];
			
 
				 		width = 1;
			
 
				 		if (r >= 0x80) {
			
 
				-			width = gb_utf8_decode(s, n, &r);
			
 
				+			width = utf8_decode(s, n, &r);
			
 
				 		}
			
 
				 		if (width == 1 && r == GB_RUNE_INVALID) {
			
 
				 			array_add(&buf, cast(u8)'\\');
			
@@ -576,7 +576,7 @@ bool unquote_char(String s, u8 quote, Rune *rune, bool *multiple_bytes, String *
 
				 		return false;
			
 
				 	} else if (s[0] >= 0x80) {
			
 
				 		Rune r = -1;
			
 
				-		isize size = gb_utf8_decode(s.text, s.len, &r);
			
 
				+		isize size = utf8_decode(s.text, s.len, &r);
			
 
				 		*rune = r;
			
 
				 		*multiple_bytes = true;
			
 
				 		*tail_string = make_string(s.text+size, s.len-size);
			
@@ -736,7 +736,7 @@ i32 unquote_string(gbAllocator a, String *s_, u8 quote=0, bool has_carriage_retu
 
				 			return 1;
			
 
				 		} else if (quote == '\'') {
			
 
				 			Rune r = GB_RUNE_INVALID;
			
 
				-			isize size = gb_utf8_decode(s.text, s.len, &r);
			
 
				+			isize size = utf8_decode(s.text, s.len, &r);
			
 
				 			if ((size == s.len) && (r != -1 || size != 1)) {
			
 
				 				*s_ = s;
			
 
				 				return 1;
			
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -789,26 +789,27 @@ void tokenizer_err(Tokenizer *t, TokenPos const &pos, char const *msg, ...) {
 
				 
			
 
				 void advance_to_next_rune(Tokenizer *t) {
			
 
				 	if (t->read_curr < t->end) {
			
 
				-		Rune rune;
			
 
				-		isize width = 1;
			
 
				-
			
 
				 		t->curr = t->read_curr;
			
 
				 		if (t->curr_rune == '\n') {
			
 
				 			t->line = t->curr;
			
 
				 			t->line_count++;
			
 
				 		}
			
 
				-		rune = *t->read_curr;
			
 
				+
			
 
				+		Rune rune = *t->read_curr;
			
 
				 		if (rune == 0) {
			
 
				 			tokenizer_err(t, "Illegal character NUL");
			
 
				-		} else if (rune >= 0x80) { // not ASCII
			
 
				-			width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			
 
				+			t->read_curr++;
			
 
				+		} else if (rune & 0x80) { // not ASCII
			
 
				+			isize width = utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
			
 
				+			t->read_curr += width;
			
 
				 			if (rune == GB_RUNE_INVALID && width == 1) {
			
 
				 				tokenizer_err(t, "Illegal UTF-8 encoding");
			
 
				 			} else if (rune == GB_RUNE_BOM && t->curr-t->start > 0){
			
 
				 				tokenizer_err(t, "Illegal byte order mark");
			
 
				 			}
			
 
				+		} else {
			
 
				+			t->read_curr++;
			
 
				 		}
			
 
				-		t->read_curr += width;
			
 
				 		t->curr_rune = rune;
			
 
				 	} else {
			
 
				 		t->curr = t->end;
			
@@ -820,7 +821,28 @@ void advance_to_next_rune(Tokenizer *t) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath, TokenizerFlags flags = TokenizerFlag_None) {
			
 
				+void init_tokenizer_with_file_contents(Tokenizer *t, String const &fullpath, gbFileContents *fc, TokenizerFlags flags) { // Points tokenizer `t` at the already-loaded buffer in `fc` and primes the first rune
			
 
				+	t->flags = flags;
			
 
				+	t->fullpath = fullpath;
			
 
				+	t->line_count = 1; // line counting starts at 1
			
 
				+
			
 
				+	t->start = cast(u8 *)fc->data; // the buffer is referenced in place; no copy is made here
			
 
				+	t->line = t->read_curr = t->curr = t->start; // every cursor begins at the first byte
			
 
				+	t->end = t->start + fc->size; // one past the last content byte
			
 
				+
			
 
				+	advance_to_next_rune(t); // loads the first rune into t->curr_rune
			
 
				+	if (t->curr_rune == GB_RUNE_BOM) {
			
 
				+		advance_to_next_rune(t); // Ignore BOM at file beginning
			
 
				+	}
			
 
				+
			
 
				+	if (t->allocated_strings.count != 0) { // array already holds entries: clear and reuse it
			
 
				+		array_clear(&t->allocated_strings);
			
 
				+	} else { // NOTE(review): an initialized-but-empty array also takes this path — confirm re-initializing it is intended
			
 
				+		array_init(&t->allocated_strings, heap_allocator());
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+TokenizerInitError init_tokenizer(Tokenizer *t, String const &fullpath, TokenizerFlags flags = TokenizerFlag_None) {
			
 
				 	TokenizerInitError err = TokenizerInit_None;
			
 
				 
			
 
				 	char *c_str = alloc_cstring(heap_allocator(), fullpath);
			
@@ -829,25 +851,18 @@ TokenizerInitError init_tokenizer(Tokenizer *t, String fullpath, TokenizerFlags
 
				 	// TODO(bill): Memory map rather than copy contents
			
 
				 	gbFileContents fc = gb_file_read_contents(heap_allocator(), true, c_str);
			
 
				 
			
 
				-	t->flags = flags;
			
 
				-	t->fullpath = fullpath;
			
 
				-	t->line_count = 1;
			
 
				-
			
 
				 	if (fc.size > I32_MAX) {
			
 
				+		t->flags = flags;
			
 
				+		t->fullpath = fullpath;
			
 
				+		t->line_count = 1;
			
 
				 		err = TokenizerInit_FileTooLarge;
			
 
				 		gb_file_free_contents(&fc);
			
 
				 	} else if (fc.data != nullptr) {
			
 
				-		t->start = cast(u8 *)fc.data;
			
 
				-		t->line = t->read_curr = t->curr = t->start;
			
 
				-		t->end = t->start + fc.size;
			
 
				-
			
 
				-		advance_to_next_rune(t);
			
 
				-		if (t->curr_rune == GB_RUNE_BOM) {
			
 
				-			advance_to_next_rune(t); // Ignore BOM at file beginning
			
 
				-		}
			
 
				-
			
 
				-		array_init(&t->allocated_strings, heap_allocator());
			
 
				+		init_tokenizer_with_file_contents(t, fullpath, &fc, flags);
			
 
				 	} else {
			
 
				+		t->flags = flags;
			
 
				+		t->fullpath = fullpath;
			
 
				+		t->line_count = 1;
			
 
				 		gbFile f = {};
			
 
				 		gbFileError file_err = gb_file_open(&f, c_str);
			
 
				 		defer (gb_file_close(&f));
			
@@ -1093,8 +1108,24 @@ bool scan_escape(Tokenizer *t) {
 
				 }
			
 
				 
			
 
				 
			
 
				-void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
			
 
				+gb_inline void tokenizer_skip_line(Tokenizer *t) { // Advances `t` until the current rune is '\n' or GB_RUNE_EOF, or no input bytes remain
			
 
				+#if 0 // disabled variant: steps rune by rune through advance_to_next_rune
			
 
				+	while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
			
 
				+		advance_to_next_rune(t);
			
 
				+	}
			
 
				+#else // active variant: steps one byte at a time without calling utf8_decode
			
 
				+	while (t->read_curr != t->end && t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
			
 
				+		t->curr = t->read_curr;
			
 
				+		t->curr_rune = *t->read_curr; // raw byte value; multi-byte sequences are not decoded on this path
			
 
				+		if (t->curr_rune == 0) {
			
 
				+			tokenizer_err(t, "Illegal character NUL");
			
 
				+		}
			
 
				+		t->read_curr++; // always exactly one byte
			
 
				+	}
			
 
				+#endif
			
 
				+}
			
 
				 
			
 
				+void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
			
 
				 	// Skip whitespace
			
 
				 	if (t->flags & TokenizerFlag_InsertSemicolon && t->insert_semicolon) {
			
 
				 		for (;;) {
			
@@ -1405,10 +1436,7 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
 
				 			token->kind = Token_Hash;
			
 
				 			if (t->curr_rune == '!') {
			
 
				 				token->kind = Token_Comment;
			
 
				-
			
 
				-				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
			
 
				-					advance_to_next_rune(t);
			
 
				-				}
			
 
				+				tokenizer_skip_line(t);
			
 
				 			}
			
 
				 			break;
			
 
				 		case '/':
			
@@ -1416,9 +1444,7 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) {
 
				 			switch (t->curr_rune) {
			
 
				 			case '/':
			
 
				 				token->kind = Token_Comment;
			
 
				-				while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) {
			
 
				-					advance_to_next_rune(t);
			
 
				-				}
			
 
				+				tokenizer_skip_line(t);
			
 
				 				break;
			
 
				 			case '*':
			
 
				 				token->kind = Token_Comment;
			
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -65,3 +65,100 @@ bool rune_is_whitespace(Rune r) {
 
				 	}
			
 
				 	return false;
			
 
				 }
			
 
				+
			
 
				+
			
 
				+gb_global u8 const global__utf8_first[256] = { // Indexed by the first byte of a sequence. As read by utf8_decode: entries >= 0xf0 give a one-byte result (0xf0: the byte itself, 0xf1: GB_RUNE_INVALID); otherwise the low 3 bits are the sequence length and the high nibble indexes global__utf8_accept_ranges
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x00-0x0F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x10-0x1F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x20-0x2F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x30-0x3F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x40-0x4F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x50-0x5F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x60-0x6F
			
 
				+	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // 0x70-0x7F
			
 
				+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x80-0x8F
			
 
				+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0x90-0x9F
			
 
				+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xA0-0xAF
			
 
				+	0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xB0-0xBF
			
 
				+	0xf1, 0xf1, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xC0-0xCF
			
 
				+	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, // 0xD0-0xDF
			
 
				+	0x13, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x23, 0x03, 0x03, // 0xE0-0xEF
			
 
				+	0x34, 0x04, 0x04, 0x04, 0x44, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, 0xf1, // 0xF0-0xFF
			
 
				+};
			
 
				+
			
 
				+typedef struct Utf8AcceptRange { // Inclusive byte range [lo, hi]
			
 
				+	u8 lo, hi; // utf8_decode rejects a second byte b1 when b1 < lo or hi < b1
			
 
				+} Utf8AcceptRange;
			
 
				+
			
 
				+gb_global Utf8AcceptRange const global__utf8_accept_ranges[] = { // Allowed second-byte ranges; the row is chosen by the high nibble of the global__utf8_first entry
			
 
				+	{0x80, 0xbf}, // row 0: entries 0x02, 0x03, 0x04
			
 
				+	{0xa0, 0xbf}, // row 1: entry 0x13 (first byte 0xE0)
			
 
				+	{0x80, 0x9f}, // row 2: entry 0x23 (first byte 0xED)
			
 
				+	{0x90, 0xbf}, // row 3: entry 0x34 (first byte 0xF0)
			
 
				+	{0x80, 0x8f}, // row 4: entry 0x44 (first byte 0xF4)
			
 
				+};
			
 
				+
			
 
				+
			
 
				+// Decodes one UTF-8 encoded rune from the start of `str`.
				+//
				+// str           - bytes to decode; at most `str_len` bytes are read
				+// str_len       - number of readable bytes at `str`
				+// codepoint_out - optional; receives the decoded rune, or GB_RUNE_INVALID
				+//                 when nothing valid could be decoded
				+//
				+// Returns the number of bytes consumed: 0 when `str_len` <= 0, 1 for a
				+// single-byte rune and for any invalid or truncated sequence, otherwise the
				+// length of the decoded sequence (2, 3 or 4).
				+isize utf8_decode(u8 const *str, isize str_len, Rune *codepoint_out) {
				+	isize width = 0;
				+	Rune codepoint = GB_RUNE_INVALID;
				+
				+	if (str_len > 0) {
				+		u8 s0 = str[0];
				+		u8 x = global__utf8_first[s0], sz;
				+		u8 b1, b2, b3;
				+		Utf8AcceptRange accept;
				+		if (x >= 0xf0) {
				+			// 0xf0: the byte is the rune itself. 0xf1: not a valid first byte.
				+			// Bit 0 is tested directly instead of building a mask by shifting
				+			// into the sign bit.
				+			codepoint = (x & 1) ? GB_RUNE_INVALID : cast(Rune)s0;
				+			width = 1;
				+			goto end;
				+		}
				+		// No separate `s0 < 0x80` test is needed: every table entry for
				+		// 0x00-0x7F is 0xf0 and has been handled above.
				+
				+		sz = x&7;
				+		accept = global__utf8_accept_ranges[x>>4];
				+		// Truncated sequence. Compare against the VALUE of sz (not its size in
				+		// bytes) so that str[1..sz-1] is never read beyond str_len.
				+		if (str_len < cast(isize)sz) {
				+			goto invalid_codepoint;
				+		}
				+
				+		b1 = str[1];
				+		if (b1 < accept.lo || accept.hi < b1) {
				+			goto invalid_codepoint;
				+		}
				+
				+		if (sz == 2) {
				+			codepoint = (cast(Rune)s0&0x1f)<<6 | (cast(Rune)b1&0x3f);
				+			width = 2;
				+			goto end;
				+		}
				+
				+		b2 = str[2];
				+		if (!gb_is_between(b2, 0x80, 0xbf)) {
				+			goto invalid_codepoint;
				+		}
				+
				+		if (sz == 3) {
				+			// A 3-byte first byte (0xE0-0xEF) carries 4 payload bits.
				+			codepoint = (cast(Rune)s0&0x0f)<<12 | (cast(Rune)b1&0x3f)<<6 | (cast(Rune)b2&0x3f);
				+			width = 3;
				+			goto end;
				+		}
				+
				+		b3 = str[3];
				+		if (!gb_is_between(b3, 0x80, 0xbf)) {
				+			goto invalid_codepoint;
				+		}
				+
				+		codepoint = (cast(Rune)s0&0x07)<<18 | (cast(Rune)b1&0x3f)<<12 | (cast(Rune)b2&0x3f)<<6 | (cast(Rune)b3&0x3f);
				+		width = 4;
				+		goto end;
				+
				+	invalid_codepoint:
				+		// Consume a single byte so callers always make forward progress.
				+		codepoint = GB_RUNE_INVALID;
				+		width = 1;
				+	}
				+
				+end:
				+	if (codepoint_out) *codepoint_out = codepoint;
				+	return width;
				+}