123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340 |
- package bufio
- import "core:bytes"
- import "core:io"
- import "core:mem"
- import "core:unicode/utf8"
- import "core:intrinsics"
// Extra errors returned by scanning procedures, beyond those in io.Error.
Scanner_Extra_Error :: enum i32 {
	Negative_Advance, // the split procedure returned a negative advance count
	Advanced_Too_Far, // the split procedure advanced beyond the available input
	Bad_Read_Count,   // the underlying reader reported an impossible byte count
	Too_Long,         // the token would exceed the scanner's max_token_size
	Too_Short,        // an empty read occurred after the buffer could no longer grow
}
// Scanner_Error is either an io.Error from the underlying reader or a
// Scanner_Extra_Error produced by the scanning logic itself.
Scanner_Error :: union {
	io.Error,
	Scanner_Extra_Error,
}
// Split_Proc is the signature of the split procedure used to tokenize the input.
// data is the remaining unprocessed input; at_eof reports that the input has
// ended (the scanner has recorded EOF or another error). It returns the number
// of bytes to advance past, the token (nil requests more data), an error that
// stops the scan, and final_token to mark this token as the last one.
Split_Proc :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool);
// Scanner provides a convenient interface for reading tokenized data, such as
// a file of newline-delimited lines of text.
Scanner :: struct {
	r:     io.Reader,     // the source of the data to tokenize
	split: Split_Proc,    // the tokenizing procedure; scanner_init sets scan_lines
	buf:   [dynamic]byte, // internal buffer holding data not yet consumed
	max_token_size: int,  // largest allowed token size; <= 0 means DEFAULT_MAX_SCAN_TOKEN_SIZE
	start: int,           // index of the first unprocessed byte in buf
	end:   int,           // one past the last valid byte in buf
	token: []byte,        // the last token produced by scanner_scan
	_err:  Scanner_Error, // sticky error; a non-EOF error is never overwritten
	max_consecutive_empty_reads:  int, // empty reads tolerated; <= 0 means DEFAULT_MAX_CONSECUTIVE_EMPTY_READS
	successive_empty_token_count: int, // consecutive empty tokens seen, used to detect no progress
	scan_called: bool,    // scanner_scan has been called at least once
	done:        bool,    // scanning has finished (a final token was returned)
}
// Default upper bound, in bytes, on the size of a single token (64 KiB).
DEFAULT_MAX_SCAN_TOKEN_SIZE :: 1<<16;

// Initial size of the internal buffer on first growth.
@(private)
_INIT_BUF_SIZE :: 4096;
// Initializes a Scanner reading from r. The internal growable buffer is
// allocated with buf_allocator, and tokens default to lines via scan_lines.
// Returns s to allow call chaining.
scanner_init :: proc(s: ^Scanner, r: io.Reader, buf_allocator := context.allocator) -> ^Scanner {
	s.buf.allocator = buf_allocator;
	s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE;
	s.split = scan_lines;
	s.r = r;
	return s;
}
// Initializes a Scanner reading from r, backed by the caller-supplied buf
// instead of an allocator-grown buffer. Tokens default to lines via
// scan_lines. Returns s to allow call chaining.
scanner_init_with_buffer :: proc(s: ^Scanner, r: io.Reader, buf: []byte) -> ^Scanner {
	s.buf = mem.buffer_from_slice(buf);
	resize(&s.buf, cap(s.buf)); // expose the whole capacity as usable space
	s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE;
	s.split = scan_lines;
	s.r = r;
	return s;
}
// Frees the scanner's internal buffer.
scanner_destroy :: proc(s: ^Scanner) {
	delete(s.buf);
}
// Returns the first non-EOF error that was encountered by the scanner.
scanner_error :: proc(s: ^Scanner) -> Scanner_Error {
	// .EOF and .None represent normal termination, not failure.
	switch s._err {
	case .EOF, .None:
		return nil;
	}
	return s._err;
}
// Returns the most recent token created by scanner_scan.
// The underlying array may point to data that may be overwritten
// by another call to scanner_scan.
// Treat the returned value as if it is immutable.
scanner_bytes :: proc(s: ^Scanner) -> []byte {
	return s.token;
}
// Returns the most recent token created by scanner_scan, as a string.
// The underlying array may point to data that may be overwritten
// by another call to scanner_scan.
// Treat the returned value as if it is immutable.
scanner_text :: proc(s: ^Scanner) -> string {
	return string(s.token);
}
// scanner_scan advances the scanner to the next token, which is then
// available through scanner_bytes/scanner_text. It returns false when the
// scan stops, either by reaching the end of the input or on error; use
// scanner_error to distinguish the two.
scanner_scan :: proc(s: ^Scanner) -> bool {
	// Records err as the scanner's sticky error; a previously recorded
	// non-EOF error is never overwritten.
	set_err :: proc(s: ^Scanner, err: Scanner_Error) {
		err := err;
		if err == .None {
			err = nil;
		}
		switch s._err {
		case nil, .EOF:
			s._err = err;
		}
	}
	if s.done {
		return false;
	}
	s.scan_called = true;
	for {
		// Check if a token is possible with what is available
		// Allow the split procedure to recover if it fails
		if s.start < s.end || s._err != nil {
			advance, token, err, final_token := s.split(s.buf[s.start:s.end], s._err != nil);
			if final_token {
				s.token = token;
				s.done = true;
				return true;
			}
			if err != nil {
				set_err(s, err);
				return false;
			}
			// Do advance, validating the split procedure's count first.
			if advance < 0 {
				set_err(s, .Negative_Advance);
				return false;
			}
			if advance > s.end-s.start {
				set_err(s, .Advanced_Too_Far);
				return false;
			}
			s.start += advance;
			s.token = token;
			if s.token != nil {
				if s._err == nil || advance > 0 {
					s.successive_empty_token_count = 0;
				} else {
					// Guard against a split procedure that keeps
					// returning empty tokens without consuming input.
					s.successive_empty_token_count += 1;
					if s.max_consecutive_empty_reads <= 0 {
						s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS;
					}
					if s.successive_empty_token_count > s.max_consecutive_empty_reads {
						set_err(s, .No_Progress);
						return false;
					}
				}
				return true;
			}
		}
		// If an error is hit, no token can be created
		if s._err != nil {
			s.start = 0;
			s.end = 0;
			return false;
		}
		// More data must be required to be read.
		// First reclaim space by shifting unconsumed data to the front of
		// the buffer, when the buffer is full or over half-consumed.
		if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
			copy(s.buf[:], s.buf[s.start:s.end]);
			s.end -= s.start;
			s.start = 0;
		}
		could_be_too_short := false;
		// Resize the buffer if full
		if s.end == len(s.buf) {
			if s.max_token_size <= 0 {
				s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE;
			}
			if len(s.buf) >= s.max_token_size {
				set_err(s, .Too_Long);
				return false;
			}
			// overflow check when doubling the buffer size
			new_size := _INIT_BUF_SIZE;
			if len(s.buf) > 0 {
				overflowed: bool;
				if new_size, overflowed = intrinsics.overflow_mul(len(s.buf), 2); overflowed {
					set_err(s, .Too_Long);
					return false;
				}
			}
			old_size := len(s.buf);
			new_size = min(new_size, s.max_token_size);
			resize(&s.buf, new_size);
			s.end -= s.start;
			s.start = 0;
			// If the buffer did not actually grow (already clamped at
			// max_token_size), a later empty read may mean the token was
			// cut short rather than that no progress was made.
			could_be_too_short = old_size >= len(s.buf);
		}
		// Read data into the buffer
		loop := 0;
		for {
			n, err := io.read(s.r, s.buf[s.end:len(s.buf)]);
			if n < 0 || len(s.buf)-s.end < n {
				// A misbehaving reader reported an impossible byte count.
				set_err(s, .Bad_Read_Count);
				break;
			}
			s.end += n;
			if err != nil {
				set_err(s, err);
				break;
			}
			if n > 0 {
				s.successive_empty_token_count = 0;
				break;
			}
			// Tolerate a limited number of zero-byte reads before giving up.
			loop += 1;
			if s.max_consecutive_empty_reads <= 0 {
				s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS;
			}
			if loop > s.max_consecutive_empty_reads {
				if could_be_too_short {
					set_err(s, .Too_Short);
				} else {
					set_err(s, .No_Progress);
				}
				break;
			}
		}
	}
}
// Split_Proc that returns each byte of the input as its own token.
scan_bytes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	// Guard all empty input, not only the at_eof case, so that a call with
	// no data cannot index out of bounds below. With at_eof false, the
	// zero-value return simply requests more data.
	if len(data) == 0 {
		return;
	}
	return 1, data[0:1], nil, false;
}
// Split_Proc that returns each UTF-8 encoded rune of the input as a token.
// An invalid encoding consumes a single byte and yields the replacement
// character U+FFFD as the token.
scan_runes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	// Guard all empty input, not only the at_eof case, so that data[0]
	// below cannot index out of bounds. With at_eof false, the zero-value
	// return simply requests more data.
	if len(data) == 0 {
		return;
	}
	// Fast path: an ASCII byte is always a complete one-byte rune.
	if data[0] < utf8.RUNE_SELF {
		advance = 1;
		token = data[0:1];
		return;
	}
	// A decoded width greater than one means a valid multi-byte rune.
	_, width := utf8.decode_rune(data);
	if width > 1 {
		advance = width;
		token = data[0:width];
		return;
	}
	// The bytes so far are not a full rune; more input may complete it.
	if !at_eof && !utf8.full_rune(data) {
		return;
	}
	// Definitely invalid: consume one byte and emit U+FFFD (0xef 0xbf 0xbd).
	@thread_local ERROR_RUNE := []byte{0xef, 0xbf, 0xbd};
	advance = 1;
	token = ERROR_RUNE;
	return;
}
// Split_Proc that returns each whitespace-separated word of the input as a
// token, with surrounding whitespace removed; it never returns an empty token.
scan_words :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	// Reports whether r is a whitespace rune (ASCII plus the Unicode
	// space characters listed below).
	is_space :: proc "contextless" (r: rune) -> bool {
		switch r {
		// lower ones
		case ' ', '\t', '\n', '\v', '\f', '\r':
			return true;
		case '\u0085', '\u00a0':
			return true;
		// higher ones
		case '\u2000' ..= '\u200a':
			return true;
		case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
			return true;
		}
		return false;
	}
	// skip spaces at the beginning
	start := 0;
	for width := 0; start < len(data); start += width {
		r: rune;
		r, width = utf8.decode_rune(data[start:]);
		if !is_space(r) {
			break;
		}
	}
	// Scan rune by rune until the next space ends the word.
	for width, i := 0, start; i < len(data); i += width {
		r: rune;
		r, width = utf8.decode_rune(data[i:]);
		if is_space(r) {
			advance = i+width; // consume the word and its terminating space
			token = data[start:i];
			return;
		}
	}
	// At EOF a final, non-empty word is returned as-is.
	if at_eof && len(data) > start {
		advance = len(data);
		token = data[start:];
		return;
	}
	// Request more data, consuming only the leading spaces seen so far.
	advance = start;
	return;
}
// Split_Proc that returns each line of the input as a token. The returned
// token excludes the terminating '\n' and any '\r' immediately before it;
// at EOF a final, unterminated line is returned as-is (minus a trailing '\r').
scan_lines :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) {
	// Strips a single trailing '\r', if present.
	drop_cr :: proc "contextless" (s: []byte) -> []byte {
		n := len(s);
		if n > 0 && s[n-1] == '\r' {
			return s[0:n-1];
		}
		return s;
	}
	if at_eof && len(data) == 0 {
		return;
	}
	if i := bytes.index_byte(data, '\n'); i >= 0 {
		// A full, newline-terminated line is available.
		return i+1, drop_cr(data[0:i]), nil, false;
	}
	if at_eof {
		// Input is over: return whatever remains as the last line.
		advance = len(data);
		token = drop_cr(data);
	}
	// Otherwise request more data.
	return;
}
|