
Add encoding/csv `Reader`

gingerBill 4 years ago
commit da380d6fc4
1 changed file with 406 additions and 0 deletions

+ 406 - 0
core/encoding/csv/reader.odin

@@ -0,0 +1,406 @@
+// package csv reads and writes comma-separated values (CSV) files.
+// This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
+package csv
+
+import "core:bufio"
+import "core:bytes"
+import "core:io"
+import "core:strings"
+import "core:unicode/utf8"
+
+// Reader is a data structure used for reading records from a CSV-encoded file
+//
+// The associated procedures for Reader expect the input to conform to RFC 4180.
+Reader :: struct {
+	// comma is the field delimiter
+	// reader_init will set it to be ','
+	// A "comma" must be a valid rune and cannot be \r, \n, or the Unicode replacement character (0xfffd)
+	comma: rune,
+
+	// comment, if not 0, is the comment character
+	// Lines beginning with the comment character without preceding whitespace are ignored
+	comment: rune,
+
+	// fields_per_record is the number of expected fields per record
+	//         if fields_per_record is >0, 'read' requires each record to have that field count
+	//         if fields_per_record is  0, 'read' sets it to the field count in the first record
+	//         if fields_per_record is <0, no check is made and records may have a variable field count
+	fields_per_record: int,
+
+	// If trim_leading_space is true, leading whitespace in a field is ignored
+	// This is done even if the field delimiter (comma) is whitespace
+	trim_leading_space: bool,
+
+	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
+	lazy_quotes: bool,
+
+	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
+	// for performance
+	// By default, each call to 'read' returns a newly allocated slice
+	reuse_record: bool,
+
+	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or use
+	// the data stored in the record buffer for performance
+	// By default, each call to 'read' clones the strings of each field
+	reuse_record_buffer: bool,
+
+
+	// internal buffers
+	r:             bufio.Reader,
+	line_count:    int, // current line being read in the CSV file
+	raw_buffer:    [dynamic]byte,
+	record_buffer: [dynamic]byte,
+	field_indices: [dynamic]int,
+	last_record:   [dynamic]string,
+	sr: strings.Reader, // used by reader_init_with_string
+}
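+
+// Example - initializing and configuring a Reader (an illustrative sketch;
+// the input string and option values here are made up):
+//
+//	r: Reader;
+//	reader_init_with_string(&r, "# ignored comment line\nx;y;z\n");
+//	defer reader_destroy(&r);
+//	r.comma = ';';               // use ';' rather than the default ','
+//	r.comment = '#';             // skip lines beginning with '#'
+//	r.trim_leading_space = true; // ignore leading whitespace in fields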
+
+
+Parser_Error_Kind :: enum {
+	Bare_Quote,
+	Quote,
+	Field_Count,
+	Invalid_Delim,
+}
+
+parser_error_kind_string := [Parser_Error_Kind]string{
+	.Bare_Quote     = "bare \" in non-quoted field",
+	.Quote          = "extra or missing \" in quoted field",
+	.Field_Count    = "wrong field count",
+	.Invalid_Delim  = "invalid delimiter",
+};
+
+Parser_Error :: struct {
+	kind:          Parser_Error_Kind,
+	start_line:    int,
+	line:          int,
+	column:        int,
+	expected, got: int, // used by .Field_Count
+}
+
+Error :: union {
+	Parser_Error,
+	io.Error,
+}
+
+DEFAULT_RECORD_BUFFER_CAPACITY :: 256;
+
+// reader_init initializes a new Reader from r
+reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
+	reader.comma = ',';
+
+	context.allocator = buffer_allocator;
+	reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY);
+	reserve(&reader.raw_buffer,    0);
+	reserve(&reader.field_indices, 0);
+	reserve(&reader.last_record,   0);
+	bufio.reader_init(&reader.r, r);
+}
+
+
+// reader_init_with_string initializes a new Reader from s
+reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
+	strings.reader_init(&reader.sr, s);
+	r, _ := io.to_reader(strings.reader_to_stream(&reader.sr));
+	reader_init(reader, r, buffer_allocator);
+}
+
+// reader_destroy destroys a Reader
+reader_destroy :: proc(r: ^Reader) {
+	delete(r.raw_buffer);
+	delete(r.record_buffer);
+	delete(r.field_indices);
+	delete(r.last_record);
+	bufio.reader_destroy(&r.r);
+}
+
+// read reads a single record (a slice of fields) from r
+//
+// All \r\n sequences are normalized to \n, including in multi-line fields
+read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
+	if r.reuse_record {
+		record, err = _read_record(r, &r.last_record, allocator);
+		resize(&r.last_record, len(record));
+		copy(r.last_record[:], record);
+	} else {
+		record, err = _read_record(r, nil, allocator);
+	}
+	return;
+}
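+
+// Example - reading records one at a time until EOF (an illustrative sketch
+// using the procedures defined in this file):
+//
+//	r: Reader;
+//	reader_init_with_string(&r, "a,b,c\r\n1,2,3\r\n");
+//	defer reader_destroy(&r);
+//	for {
+//		record, err := read(&r);
+//		if is_io_error(err, .EOF) {
+//			break;
+//		}
+//		if err != nil {
+//			break; // handle the parse or IO error
+//		}
+//		for field in record {
+//			// use field
+//		}
+//	}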
+
+// is_io_error checks whether an Error is a specific io.Error kind
+is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
+	if v, ok := err.(io.Error); ok {
+		return v == io_err;
+	}
+	return false;
+}
+
+
+// read_all reads all the remaining records from r.
+// Each record is a slice of fields.
+// read_all is defined to read until EOF, and does not treat EOF as an error
+read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
+	context.allocator = allocator;
+	records: [dynamic][]string;
+	for {
+		record, rerr := _read_record(r, nil, allocator);
+		if is_io_error(rerr, .EOF) {
+			return records[:], nil;
+		}
+		if rerr != nil {
+			return nil, rerr;
+		}
+		append(&records, record);
+	}
+}
+
+// read_from_string reads a single record (a slice of fields) from the provided input string.
+read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
+	ir: strings.Reader;
+	strings.reader_init(&ir, input);
+	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));
+
+	r: Reader;
+	reader_init(&r, input_reader, buffer_allocator);
+	defer reader_destroy(&r);
+	record, err = read(&r, record_allocator);
+	n = int(r.r.r);
+	return;
+}
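+
+// Example (an illustrative sketch):
+//
+//	record, n, err := read_from_string("a,b,c\n1,2,3\n");
+//	if err == nil {
+//		// record holds the fields of the first record: "a", "b", "c"
+//		// n is the read offset into the input after that record
+//	}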
+
+
+// read_all_from_string reads all the remaining records from the provided input string.
+read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
+	ir: strings.Reader;
+	strings.reader_init(&ir, input);
+	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));
+
+	r: Reader;
+	reader_init(&r, input_reader, buffer_allocator);
+	defer reader_destroy(&r);
+	return read_all(&r, records_allocator);
+}
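+
+// Example (an illustrative sketch):
+//
+//	records, err := read_all_from_string("a,b,c\n1,2,3\n");
+//	if err == nil {
+//		for record in records {
+//			// each record is a []string of fields
+//		}
+//	}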
+
+
+@private
+_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
+	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
+		line, err := bufio.reader_read_slice(&r.r, '\n');
+		if err == .Buffer_Full {
+			clear(&r.raw_buffer);
+			append(&r.raw_buffer, ..line);
+			for err == .Buffer_Full {
+				line, err = bufio.reader_read_slice(&r.r, '\n');
+				append(&r.raw_buffer, ..line);
+			}
+			line = r.raw_buffer[:];
+		}
+		if len(line) > 0 && err == .EOF {
+			err = nil;
+			if line[len(line)-1] == '\r' {
+				line = line[:len(line)-1];
+			}
+		}
+		r.line_count += 1;
+
+		// normalize \r\n to \n
+		n := len(line);
+		for n >= 2 && string(line[n-2:]) == "\r\n" {
+			line[n-2] = '\n';
+			line = line[:n-1];
+		}
+
+		return line, err;
+	}
+
+	is_valid_delim :: proc(r: rune) -> bool {
+		switch r {
+		case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
+			return false;
+		}
+		return utf8.valid_rune(r);
+	}
+
+	length_newline :: proc(b: []byte) -> int {
+		if len(b) > 0 && b[len(b)-1] == '\n' {
+			return 1;
+		}
+		return 0;
+	}
+
+	next_rune :: proc(b: []byte) -> rune {
+		r, _ := utf8.decode_rune(b);
+		return r;
+	}
+
+	if r.comma == r.comment ||
+	   !is_valid_delim(r.comma) ||
+	   (r.comment != 0 && !is_valid_delim(r.comment)) {
+		err := Parser_Error{
+			kind = .Invalid_Delim,
+			line = r.line_count,
+		};
+		return nil, err;
+	}
+
+	line, full_line: []byte;
+	err_read: io.Error;
+	for err_read == nil {
+		line, err_read = read_line(r);
+		if r.comment != 0 && next_rune(line) == r.comment {
+			line = nil;
+			continue;
+		}
+		if err_read == nil && len(line) == length_newline(line) {
+			line = nil;
+			continue;
+		}
+		full_line = line;
+		break;
+	}
+
+	if is_io_error(err_read, .EOF) {
+		return nil, err_read;
+	}
+
+	err: Error;
+	quote_len :: len(`"`);
+	comma_len := utf8.rune_size(r.comma);
+	record_line := r.line_count;
+	clear(&r.record_buffer);
+	clear(&r.field_indices);
+
+	parse_field: for {
+		if r.trim_leading_space {
+			line = bytes.trim_left_space(line);
+		}
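+		// unquoted field: take the bytes up to the next delimiter or end of line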
+		if len(line) == 0 || line[0] != '"' {
+			i := bytes.index_rune(line, r.comma);
+			field := line;
+			if i >= 0 {
+				field = field[:i];
+			} else {
+				field = field[:len(field) - length_newline(field)];
+			}
+
+			if !r.lazy_quotes {
+				if j := bytes.index_byte(field, '"'); j >= 0 {
+					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])]);
+					err = Parser_Error{
+						kind = .Bare_Quote,
+						start_line = record_line,
+						line = r.line_count,
+						column = column,
+					};
+					break parse_field;
+				}
+			}
+			append(&r.record_buffer, ..field);
+			append(&r.field_indices, len(r.record_buffer));
+			if i >= 0 {
+				line = line[i+comma_len:];
+				continue parse_field;
+			}
+			break parse_field;
+
+		} else {
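+			// quoted field: scan for the closing quote, handling "" escapes and multi-line fields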
+			line = line[quote_len:];
+			for {
+				i := bytes.index_byte(line, '"');
+				switch {
+				case i >= 0:
+					append(&r.record_buffer, ..line[:i]);
+					line = line[i+quote_len:];
+					switch ch := next_rune(line); {
+					case ch == '"': // append quote
+						append(&r.record_buffer, '"');
+						line = line[quote_len:];
+					case ch == r.comma: // end of field
+						line = line[comma_len:];
+						append(&r.field_indices, len(r.record_buffer));
+						continue parse_field;
+					case length_newline(line) == len(line): // end of line
+						append(&r.field_indices, len(r.record_buffer));
+						break parse_field;
+					case r.lazy_quotes: // bare quote
+						append(&r.record_buffer, '"');
+					case: // invalid non-escaped quote
+						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len]);
+						err = Parser_Error{
+							kind = .Quote,
+							start_line = record_line,
+							line = r.line_count,
+							column = column,
+						};
+						break parse_field;
+					}
+
+				case len(line) > 0:
+					append(&r.record_buffer, ..line);
+					if err_read != nil {
+						break parse_field;
+					}
+					line, err_read = read_line(r);
+					if is_io_error(err_read, .EOF) {
+						err_read = nil;
+					}
+					full_line = line;
+
+				case:
+					if !r.lazy_quotes && err_read == nil {
+						column := utf8.rune_count(full_line);
+						err = Parser_Error{
+							kind = .Quote,
+							start_line = record_line,
+							line = r.line_count,
+							column = column,
+						};
+						break parse_field;
+					}
+					append(&r.field_indices, len(r.record_buffer));
+					break parse_field;
+				}
+			}
+		}
+	}
+
+	if err == nil && err_read != nil {
+		err = err_read;
+	}
+
+	context.allocator = allocator;
+	dst := dst;
+	str := string(r.record_buffer[:]);
+	if dst == nil {
+		// use local variable
+		dst = &([dynamic]string){};
+	}
+	clear(dst);
+	resize(dst, len(r.field_indices));
+	pre_idx: int;
+	for idx, i in r.field_indices {
+		field := str[pre_idx:idx];
+		if !r.reuse_record_buffer {
+			field = strings.clone(field);
+		}
+		dst[i] = field;
+		pre_idx = idx;
+	}
+
+	if r.fields_per_record > 0 {
+		if len(dst) != r.fields_per_record && err == nil {
+			err = Parser_Error{
+				kind = .Field_Count,
+				start_line = record_line,
+				line = r.line_count,
+				expected = r.fields_per_record,
+				got = len(dst),
+			};
+		}
+	} else if r.fields_per_record == 0 {
+		r.fields_per_record = len(dst);
+	}
+	return dst[:], err;
+}