// package csv reads and writes comma-separated values (CSV) files.
// This package supports the format described in RFC 4180
package csv

import "core:bufio"
import "core:bytes"
import "core:io"
import "core:strings"
import "core:unicode/utf8"

// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect its input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter
	// reader_init will set it to be ','
	// A "comma" must be a valid rune, and it cannot be \r, \n, or the Unicode replacement character (0xfffd)
	comma: rune,

	// comment, if not 0, is the comment character
	// Lines beginning with the comment character without a preceding whitespace are ignored
	comment: rune,

	// fields_per_record is the number of expected fields per record
	// if fields_per_record is >0, 'read' requires each record to have that field count
	// if fields_per_record is 0, 'read' sets it to the field count in the first record
	// if fields_per_record is <0, no check is made and records may have a variable field count
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored
	// This is done even if the field delimiter (comma), is whitespace
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
	lazy_quotes: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance
	// By default, each call to 'read' returns a newly allocated slice
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or uses
	// the data stored in record buffer for performance
	// By default, each call to 'read' clones the strings of each field
	reuse_record_buffer: bool,

	// internal buffers
	r:             bufio.Reader,
	line_count:    int, // current line being read in the CSV file
	raw_buffer:    [dynamic]byte,
	record_buffer: [dynamic]byte,
	field_indices: [dynamic]int,
	last_record:   [dynamic]string,
	sr:            strings.Reader, // used by reader_init_with_string
}

// Reader_Error_Kind enumerates the parse failures a Reader can report.
Reader_Error_Kind :: enum {
	Bare_Quote,    // a '"' appeared inside a non-quoted field (and lazy_quotes is off)
	Quote,         // an extra or missing '"' in a quoted field
	Field_Count,   // record's field count does not match fields_per_record
	Invalid_Delim, // comma/comment runes are invalid or equal to each other
}

// Human-readable descriptions for each Reader_Error_Kind.
reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote    = "bare \" in non-quoted field",
	.Quote         = "extra or missing \" in quoted field",
	.Field_Count   = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
};

// Reader_Error describes a parse error, with 1-based line information and a
// 0-based rune column where applicable.
Reader_Error :: struct {
	kind:       Reader_Error_Kind,
	start_line: int, // line on which the offending record started
	line:       int, // line on which the error was detected
	column:     int, // rune index in the line where the error occurred
	expected, got: int, // used by .Field_Count
}

// Error is either a CSV parse error or an underlying stream error.
Error :: union {
	Reader_Error,
	io.Error,
}

// Initial capacity reserved for the per-record byte buffer.
DEFAULT_RECORD_BUFFER_CAPACITY :: 256;

// reader_init initializes a new Reader from r
// Internal buffers are allocated from buffer_allocator; free them with reader_destroy.
reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
	reader.comma = ',';
	context.allocator = buffer_allocator;
	reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY);
	// reserve with 0 pins the buffers to buffer_allocator without allocating yet
	reserve(&reader.raw_buffer, 0);
	reserve(&reader.field_indices, 0);
	reserve(&reader.last_record, 0);
	bufio.reader_init(&reader.r, r);
}

// reader_init_with_string initializes a new Reader from s
// The string reader lives in reader.sr, so s must outlive the Reader.
reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
	strings.reader_init(&reader.sr, s);
	r, _ := io.to_reader(strings.reader_to_stream(&reader.sr));
	reader_init(reader, r, buffer_allocator);
}

// reader_destroy destroys a Reader, freeing all of its internal buffers
reader_destroy :: proc(r: ^Reader) {
	delete(r.raw_buffer);
	delete(r.record_buffer);
	delete(r.field_indices);
	delete(r.last_record);
	bufio.reader_destroy(&r.r);
}

// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including multi-line fields
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		// Parse into last_record and hand back a view of it; the caller must
		// not hold onto the slice across calls.
		record, err = _read_record(r, &r.last_record, allocator);
		resize(&r.last_record, len(record));
		copy(r.last_record[:], record);
	} else {
		record, err = _read_record(r, nil, allocator);
	}
	return;
}

// is_io_error checks whether an Error is a specific io.Error kind
is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
	if v, ok := err.(io.Error); ok {
		return v == io_err;
	}
	return false;
}

// read_all reads all the remaining records from r.
// Each record is a slice of fields.
// read_all is defined to read until an EOF, and does not treat EOF as an error
read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
	context.allocator = allocator;
	records: [dynamic][]string;
	for {
		record, rerr := _read_record(r, nil, allocator);
		if is_io_error(rerr, .EOF) {
			return records[:], nil;
		}
		if rerr != nil {
			// Error path: free the partially-accumulated records so they do not leak.
			// Field strings are only owned by the record when they were cloned,
			// i.e. when reuse_record_buffer is false.
			for rec in records {
				if !r.reuse_record_buffer {
					for f in rec {
						delete(f);
					}
				}
				delete(rec);
			}
			delete(records);
			return nil, rerr;
		}
		append(&records, record);
	}
}

// read_from_string reads a single record (a slice of fields) from the provided input.
// n is the number of bytes consumed from input.
read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
	ir: strings.Reader;
	strings.reader_init(&ir, input);
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));
	r: Reader;
	reader_init(&r, input_reader, buffer_allocator);
	defer reader_destroy(&r);
	record, err = read(&r, record_allocator);
	n = int(r.r.r); // bufio read position == bytes consumed
	return;
}

// read_all reads all the remaining records from the provided input.
// read_all_from_string wraps read_all over a string input; the temporary
// Reader (and its buffers) are destroyed before returning.
read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
	ir: strings.Reader;
	strings.reader_init(&ir, input);
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));
	r: Reader;
	reader_init(&r, input_reader, buffer_allocator);
	defer reader_destroy(&r);
	return read_all(&r, records_allocator);
}

// is_valid_delim reports whether r may be used as a comma/comment rune:
// it must be a valid rune and not NUL, '"', '\r', '\n', or the replacement character.
@private
is_valid_delim :: proc(r: rune) -> bool {
	switch r {
	case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
		return false;
	}
	return utf8.valid_rune(r);
}

// _read_record parses one CSV record (which may span several physical lines
// when fields are quoted) into r.record_buffer/r.field_indices, then
// materializes it as a []string. If dst is non-nil the record is written into
// it; otherwise a fresh dynamic array is used. Fields are cloned unless
// r.reuse_record_buffer is set.
@private
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	// read_line returns the next physical line including its trailing '\n'
	// (if any), growing r.raw_buffer when the line exceeds the bufio buffer.
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		line, err := bufio.reader_read_slice(&r.r, '\n');
		if err == .Buffer_Full {
			// Line is longer than the bufio buffer: accumulate pieces in raw_buffer.
			clear(&r.raw_buffer);
			append(&r.raw_buffer, ..line);
			for err == .Buffer_Full {
				line, err = bufio.reader_read_slice(&r.r, '\n');
				append(&r.raw_buffer, ..line);
			}
			line = r.raw_buffer[:];
		}
		if len(line) > 0 && err == .EOF {
			// A non-empty final line without '\n' is still a valid line;
			// drop a dangling '\r' and suppress the EOF for this call.
			err = nil;
			if line[len(line)-1] == '\r' {
				line = line[:len(line)-1];
			}
		}
		r.line_count += 1;

		// normalize \r\n to \n
		// NOTE(review): n is captured once, so after one trim the suffix check
		// compares a 1-byte slice against "\r\n" and fails — this loop
		// effectively runs at most once (a line ends in at most one "\r\n").
		n := len(line);
		for n >= 2 && string(line[n-2:]) == "\r\n" {
			line[n-2] = '\n';
			line = line[:n-1];
		}
		return line, err;
	}

	// length_newline is 1 if b ends in '\n', else 0; used to strip/detect EOL.
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1;
		}
		return 0;
	}

	// next_rune decodes the first rune of b (RUNE_ERROR on empty/invalid input).
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b);
		return r;
	}

	// Validate delimiter configuration before reading anything.
	if r.comma == r.comment || !is_valid_delim(r.comma) || (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Reader_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		};
		return nil, err;
	}

	// Skip comment lines and blank lines until the first content line.
	// full_line is kept for column (rune offset) reporting in errors.
	line, full_line: []byte;
	err_read: io.Error;
	for err_read == nil {
		line, err_read = read_line(r);
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil;
			continue; // comment line: skip
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil;
			continue; // empty (or newline-only) line: skip
		}
		full_line = line;
		break;
	}
	if is_io_error(err_read, .EOF) {
		return nil, err_read;
	}

	err: Error;
	quote_len :: len(`"`);
	comma_len := utf8.rune_size(r.comma);
	record_line := r.line_count;
	clear(&r.record_buffer);
	clear(&r.field_indices);
	// Field bytes are appended to record_buffer; field_indices records the
	// end offset of each field within it.
	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line);
		}

		if len(line) == 0 || line[0] != '"' {
			// Unquoted field: runs until the next comma or end of line.
			i := bytes.index_rune(line, r.comma);
			field := line;
			if i >= 0 {
				field = field[:i];
			} else {
				field = field[:len(field) - length_newline(field)];
			}

			// A '"' inside an unquoted field is an error unless lazy_quotes.
			if !r.lazy_quotes {
				if j := bytes.index_byte(field, '"'); j >= 0 {
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])]);
					err = Reader_Error{
						kind = .Bare_Quote,
						start_line = record_line,
						line = r.line_count,
						column = column,
					};
					break parse_field;
				}
			}
			append(&r.record_buffer, ..field);
			append(&r.field_indices, len(r.record_buffer));
			if i >= 0 {
				line = line[i+comma_len:];
				continue parse_field;
			}
			break parse_field;
		} else {
			// Quoted field: consume the opening quote, then scan for the
			// closing quote, which may be on a later physical line.
			line = line[quote_len:];
			for {
				i := bytes.index_byte(line, '"');
				switch {
				case i >= 0:
					// Found a quote: copy preceding bytes, then inspect what follows it.
					append(&r.record_buffer, ..line[:i]);
					line = line[i+quote_len:];
					switch ch := next_rune(line); {
					case ch == '"': // append quote ("" is an escaped quote)
						append(&r.record_buffer, '"');
						line = line[quote_len:];
					case ch == r.comma: // end of field
						line = line[comma_len:];
						append(&r.field_indices, len(r.record_buffer));
						continue parse_field;
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer));
						break parse_field;
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"');
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len]);
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						};
						break parse_field;
					}
				case len(line) > 0:
					// No closing quote on this line: the field continues onto
					// the next physical line (newline bytes are kept).
					append(&r.record_buffer, ..line);
					if err_read != nil {
						break parse_field;
					}
					line, err_read = read_line(r);
					if is_io_error(err_read, .EOF) {
						err_read = nil;
					}
					full_line = line;
				case:
					// Input ended inside a quoted field.
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line);
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						};
						break parse_field;
					}
					append(&r.field_indices, len(r.record_buffer));
					break parse_field;
				}
			}
		}
	}

	if err == nil && err_read != nil {
		err = err_read;
	}

	// Materialize the record: slice record_buffer at each stored field end.
	context.allocator = allocator;
	dst := dst;
	str := string(r.record_buffer[:]);
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){};
	}
	clear(dst);
	resize(dst, len(r.field_indices));
	pre_idx: int;
	for idx, i in r.field_indices {
		field := str[pre_idx:idx];
		if !r.reuse_record_buffer {
			// Clone so the field survives the next call's clear of record_buffer.
			field = strings.clone(field);
		}
		dst[i] = field;
		pre_idx = idx;
	}

	// Enforce (or learn, when 0) the expected field count.
	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Reader_Error{
				kind = .Field_Count,
				start_line = record_line,
				line = r.line_count,
				expected = r.fields_per_record,
				got = len(dst),
			};
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst);
	}
	return dst[:], err;
}