4 years ago · 510d1f2518
--- a/core/text/scanner/scanner.odin
+++ b/core/text/scanner/scanner.odin
@@ -1,3 +1,11 @@
 
				+// package text/scanner provides a scanner and tokenizer for UTF-8-encoded text.
			
 
				+// It takes a string providing the source, which then can be tokenized through
			
 
				+// repeated calls to the scan procedure.
			
 
				+// For compatibility with existing tooling and languages, the NUL character is not allowed.
			
 
				+// If a UTF-8 encoded byte order mark (BOM) is the first character in the source, it will be discarded.
			
 
				+//
			
 
				+// By default, a Scanner skips white space and Odin comments and recognizes all literals defined by the Odin programming language specification.
			
 
				+// A Scanner may be customized to recognize only a subset of those literals and to recognize different identifiers and white space characters.
			
 
				 package text_scanner
			
 
				 
			
 
				 import "core:fmt"
			
@@ -5,6 +13,8 @@ import "core:strings"
 
				 import "core:unicode"
			
 
				 import "core:unicode/utf8"
			
 
				 
			
 
				+// Position represents a source position
			
 
				+// A position is valid if line > 0
			
 
				 Position :: struct {
			
 
				 	filename: string, // filename, if present
			
 
				 	offset:   int,    // byte offset, starting @ 0
			
@@ -12,6 +22,7 @@ Position :: struct {
 
				 	column:   int,    // column number, starting @ 1 (character count per line)
			
 
				 }
			
 
				 
			
 
				+// position_is_valid reports whether the position is valid
			
 
				 position_is_valid :: proc(pos: Position) -> bool {
			
 
				 	return pos.line > 0;
			
 
				 }
			
@@ -43,22 +54,24 @@ Scan_Flag :: enum u32 {
 
				 	Scan_Idents,
			
 
				 	Scan_Ints,
			
 
				 	Scan_C_Int_Prefixes,
			
 
				-	Scan_Floats,
			
 
				+	Scan_Floats, // Includes integers and hexadecimal floats
			
 
				 	Scan_Chars,
			
 
				 	Scan_Strings,
			
 
				 	Scan_Raw_Strings,
			
 
				 	Scan_Comments,
			
 
				-	Skip_Comments,
			
 
				+	Skip_Comments, // if set with .Scan_Comments, comments become white space
			
 
				 }
			
 
				 Scan_Flags :: bit_set[Scan_Flag; u32];
			
 
				 
			
 
				 Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
			
 
				 C_Like_Tokens    :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
			
 
				 
			
 
				+// Odin_Whitespace is the default value for the Scanner's whitespace field
			
 
				 Odin_Whitespace :: 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ';
			
 
				 C_Whitespace    :: 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<'\v' | 1<<'\f' | 1<<' ';
			
 
				 
			
 
				 
			
 
				+// Scanner allows for the reading of Unicode characters and tokens from a string
			
 
				 Scanner :: struct {
			
 
				 	src: string,
			
 
				 
			
@@ -75,20 +88,39 @@ Scanner :: struct {
 
				 	prev_line_len: int,
			
 
				 	prev_char_len: int,
			
 
				 
			
 
				+	// error is called for each error encountered
			
 
				+	// If no error procedure is set, the error is reported to os.stderr
			
 
				 	error: proc(s: ^Scanner, msg: string),
			
 
				+
			
 
				+	// error_count is incremented by one for each error encountered
			
 
				 	error_count: int,
			
 
				 
			
 
				+	// flags controls which tokens are recognized
			
 
				+	// e.g. to recognize integers, set the .Scan_Ints flag
			
 
				+	// This field may be changed by the user at any time during scanning
			
 
				 	flags: Scan_Flags,
			
 
				+
			
 
				+	// The whitespace field controls which characters are recognized as white space
			
 
				+	// This field may be changed by the user at any time during scanning
			
 
				 	whitespace: u64,
			
 
				 
			
 
				+	// is_ident_rune is a predicate controlling the characters accepted as the ith rune in an identifier
			
 
				+	// The valid characters must not conflict with the set of white space characters
			
 
				+	// If is_ident_rune is not set, regular Odin-like identifiers are accepted
			
 
				+	// This field may be changed by the user at any time during scanning
			
 
				 	is_ident_rune: proc(ch: rune, i: int) -> bool,
			
 
				 
			
 
				+	// Start position of most recently scanned token (set by scan(s))
			
 
				+	// Calling init or next invalidates the position
			
 
				 	pos: Position,
			
 
				 }
			
 
				 
			
 
				+// init initializes a scanner with a new source and returns itself.
			
 
				+// error_count is set to 0, flags is set to Odin_Like_Tokens, whitespace is set to Odin_Whitespace
			
 
				 init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
			
 
				 	s^ = {};
			
 
				 
			
 
				+	s.error_count = 0;
			
 
				 	s.src = src;
			
 
				 	s.pos.filename = filename;
			
 
				 
			
@@ -140,6 +172,8 @@ advance :: proc(s: ^Scanner) -> rune {
 
				 	return ch;
			
 
				 }
			
 
				 
			
 
				+// next reads and returns the next Unicode character. It returns EOF at the end of the source.
			
 
				+// next does not update the Scanner's pos field. Use 'position(s)' to get the current position
			
 
				 next :: proc(s: ^Scanner) -> rune {
			
 
				 	s.tok_pos = -1;
			
 
				 	s.pos.line = 0;
			
@@ -150,6 +184,9 @@ next :: proc(s: ^Scanner) -> rune {
 
				 	return ch;
			
 
				 }
			
 
				 
			
 
				+// peek returns the next Unicode character in the source without advancing the scanner
			
 
				+// It returns EOF if the scanner's position is at least the last character of the source
			
 
				+// If n > 0, it calls next n times, returns the nth Unicode character, and then restores the Scanner's state
			
 
				 peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
			
 
				 	if s.ch == -2 {
			
 
				 		s.ch = advance(s);
			
@@ -168,7 +205,9 @@ peek :: proc(s: ^Scanner, n := 0) -> (ch: rune) {
 
				 	}
			
 
				 	return ch;
			
 
				 }
			
 
				-
			
 
				+// peek_token returns the next token in the source
			
 
				+// It returns EOF if the scanner's position is at least the last character of the source
			
 
				+// If n > 0, it calls next n times, returns the nth token, and then restores the Scanner's state
			
 
				 peek_token :: proc(s: ^Scanner, n := 0) -> (tok: rune) {
			
 
				 	assert(n >= 0);
			
 
				 	prev_s := s^;
			
@@ -469,6 +508,10 @@ scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
 
				 	return ch;
			
 
				 }
			
 
				 
			
 
				+// scan reads the next token or Unicode character from source and returns it
			
 
				+// It only recognizes tokens for which the respective flag is set
			
 
				+// It returns EOF at the end of the source
			
 
				+// It reports Scanner errors by calling s.error, if not nil; otherwise it will print the error message to os.stderr
			
 
				 scan :: proc(s: ^Scanner) -> (tok: rune) {
			
 
				 	ch := peek(s);
			
 
				 	if ch == EOF {
			
@@ -563,6 +606,8 @@ scan :: proc(s: ^Scanner) -> (tok: rune) {
 
				 	return tok;
			
 
				 }
			
 
				 
			
 
				+// position returns the position of the character immediately after the character or token returned by the previous call to next or scan
			
 
				+// Use the Scanner's pos field for the most recently scanned token position
			
 
				 position :: proc(s: ^Scanner) -> Position {
			
 
				 	pos: Position;
			
 
				 	pos.filename = s.pos.filename;
			
@@ -581,6 +626,7 @@ position :: proc(s: ^Scanner) -> Position {
 
				 	return pos;
			
 
				 }
			
 
				 
			
 
				+// token_text returns the string of the most recently scanned token
			
 
				 token_text :: proc(s: ^Scanner) -> string {
			
 
				 	if s.tok_pos < 0 {
			
 
				 		return "";
			
@@ -588,6 +634,8 @@ token_text :: proc(s: ^Scanner) -> string {
 
				 	return string(s.src[s.tok_pos:s.tok_end]);
			
 
				 }
			
 
				 
			
 
				+// token_string returns a printable string for a token or Unicode character
			
 
				+// By default, it uses the context.temp_allocator to produce the string
			
 
				 token_string :: proc(tok: rune, allocator := context.temp_allocator) -> string {
			
 
				 	context.allocator = allocator;
			
 
				 	switch tok {