123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406 |
- // package csv reads and writes comma-separated values (CSV) files.
- // This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
- package csv
- import "core:bufio"
- import "core:bytes"
- import "core:io"
- import "core:strings"
- import "core:unicode/utf8"
// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect its input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter
	// reader_init will set it to be ','
	// A "comma" must be a valid rune, and must not be \r, \n, or the Unicode replacement character (0xfffd)
	comma: rune,
	// comment, if not 0, is the comment character
	// Lines beginning with the comment character without a preceding whitespace are ignored
	comment: rune,
	// fields_per_record is the number of expected fields per record
	// if fields_per_record is >0, 'read' requires each record to have that field count
	// if fields_per_record is 0, 'read' sets it to the field count in the first record
	// if fields_per_record is <0, no check is made and records may have a variable field count
	fields_per_record: int,
	// If trim_leading_space is true, leading whitespace in a field is ignored
	// This is done even if the field delimiter (comma), is whitespace
	trim_leading_space: bool,
	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
	lazy_quotes: bool,
	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance
	// By default, each call to 'read' returns a newly allocated slice
	reuse_record: bool,
	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or uses
	// the data stored in record buffer for performance
	// By default, each call to 'read' clones the strings of each field
	reuse_record_buffer: bool,

	// internal buffers
	r: bufio.Reader,          // buffered view over the underlying io.Reader
	line_count: int,          // current line being read in the CSV file
	raw_buffer: [dynamic]byte,    // overflow storage for lines longer than the bufio buffer
	record_buffer: [dynamic]byte, // all field bytes of the current record, laid out contiguously
	field_indices: [dynamic]int,  // end offset of each field within record_buffer
	last_record: [dynamic]string, // backing storage for the returned record when reuse_record is set
	sr: strings.Reader, // used by reader_init_with_string
}
// Reader_Error_Kind enumerates the kinds of parse errors a Reader can produce
Reader_Error_Kind :: enum {
	// A '"' appeared inside a non-quoted field
	Bare_Quote,
	// An extra or missing '"' in a quoted field
	Quote,
	// A record did not match the expected field count (fields_per_record)
	Field_Count,
	// The configured delimiter or comment rune is invalid (see is_valid_delim)
	Invalid_Delim,
}
// reader_error_kind_string maps each Reader_Error_Kind to a human-readable message
reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote = "bare \" in non-quoted field",
	.Quote = "extra or missing \" in quoted field",
	.Field_Count = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
}
// Reader_Error is a CSV parse error, locating the problem within the input
Reader_Error :: struct {
	kind: Reader_Error_Kind,
	start_line: int, // line on which the offending record started
	line: int,       // line on which the error was detected
	column: int,     // rune (not byte) index within the line
	expected, got: int, // used by .Field_Count
}
// Error is the union of everything a csv read procedure can fail with:
// a CSV-level parse error or an underlying I/O error
Error :: union {
	Reader_Error,
	io.Error,
}
// DEFAULT_RECORD_BUFFER_CAPACITY is the initial capacity reserved for
// Reader.record_buffer by reader_init
DEFAULT_RECORD_BUFFER_CAPACITY :: 256
// reader_init initializes a new Reader that reads CSV data from r
//
// All of the Reader's internal buffers use buffer_allocator;
// release them with reader_destroy.
reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
	context.allocator = buffer_allocator

	// Default field delimiter per RFC 4180
	reader.comma = ','

	// Touch each dynamic buffer while buffer_allocator is in context so that
	// later growth and reader_destroy use the same allocator
	reserve(&reader.raw_buffer, 0)
	reserve(&reader.field_indices, 0)
	reserve(&reader.last_record, 0)
	reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY)

	bufio.reader_init(&reader.r, r)
}
// reader_init_with_string initializes a new Reader that reads CSV data from the string s
reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
	strings.reader_init(&reader.sr, s)
	// Wrap the string reader in a generic io.Reader and delegate to reader_init
	stream := strings.reader_to_stream(&reader.sr)
	input, _ := io.to_reader(stream)
	reader_init(reader, input, buffer_allocator)
}
// reader_destroy releases every internal buffer owned by a Reader
reader_destroy :: proc(r: ^Reader) {
	bufio.reader_destroy(&r.r)
	delete(r.last_record)
	delete(r.field_indices)
	delete(r.record_buffer)
	delete(r.raw_buffer)
}
// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including multi-line field
//
// When r.reuse_record is set, the returned slice aliases r.last_record and is
// only valid until the next call to read; otherwise it is allocated with the
// given allocator.
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		// _read_record fills r.last_record and returns a slice of it
		record, err = _read_record(r, &r.last_record, allocator)
		// Keep r.last_record's length in sync with the record just produced
		resize(&r.last_record, len(record))
		copy(r.last_record[:], record)
	} else {
		record, err = _read_record(r, nil, allocator)
	}
	return
}
// is_io_error checks whether an Error is a specific io.Error kind
is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
	v, ok := err.(io.Error)
	return ok && v == io_err
}
// read_all reads all the remaining records from r.
// Each record is a slice of fields.
// read_all is defined to read until an EOF, and does not treat EOF as an error.
//
// On any other error, every record read so far is freed and nil is returned
// together with the error.
read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
	context.allocator = allocator
	records: [dynamic][]string
	for {
		record, rerr := _read_record(r, nil, allocator)
		if is_io_error(rerr, .EOF) {
			return records[:], nil
		}
		if rerr != nil {
			// The caller only receives nil on this path, so previously read
			// records would leak if not freed here
			for rec in records {
				if !r.reuse_record_buffer {
					// Field strings were cloned per field; free them too.
					// (When reuse_record_buffer is set they alias the shared
					// record buffer and must not be deleted.)
					for field in rec {
						delete(field)
					}
				}
				delete(rec)
			}
			delete(records)
			return nil, rerr
		}
		append(&records, record)
	}
}
// read_from_string reads a single record (a slice of fields) from the provided input.
//
// record is allocated with record_allocator; the temporary Reader's internal
// buffers use buffer_allocator and are destroyed before returning.
read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
	string_reader: strings.Reader
	strings.reader_init(&string_reader, input)
	stream_reader, _ := io.to_reader(strings.reader_to_stream(&string_reader))

	csv_reader: Reader
	reader_init(&csv_reader, stream_reader, buffer_allocator)
	defer reader_destroy(&csv_reader)

	record, err = read(&csv_reader, record_allocator)
	// NOTE(review): n reports the bufio reader's current read offset —
	// presumably the number of bytes consumed from input; verify against bufio
	n = int(csv_reader.r.r)
	return
}
// read_all_from_string reads all the remaining records from the provided input.
//
// The returned records are allocated with records_allocator; the temporary
// Reader's internal buffers use buffer_allocator and are destroyed on return.
read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
	string_reader: strings.Reader
	strings.reader_init(&string_reader, input)
	stream_reader, _ := io.to_reader(strings.reader_to_stream(&string_reader))

	csv_reader: Reader
	reader_init(&csv_reader, stream_reader, buffer_allocator)
	defer reader_destroy(&csv_reader)

	return read_all(&csv_reader, records_allocator)
}
// is_valid_delim reports whether r may be used as a delimiter or comment rune:
// it must be a valid rune and must not be NUL, '"', '\r', '\n',
// or the Unicode replacement character
@private
is_valid_delim :: proc(r: rune) -> bool {
	if r == 0 || r == '"' || r == '\r' || r == '\n' || r == utf8.RUNE_ERROR {
		return false
	}
	return utf8.valid_rune(r)
}
// _read_record parses one CSV record.
// Field bytes are accumulated contiguously in r.record_buffer with end offsets
// in r.field_indices, then sliced into strings. When dst is nil a fresh
// dynamic array is allocated with allocator; otherwise dst is reused and the
// returned slice aliases it. Unless r.reuse_record_buffer is set, each field
// string is cloned so it outlives r.record_buffer.
@private
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	// read_line reads the next line from the underlying bufio.Reader, spilling
	// into r.raw_buffer when the line exceeds the bufio buffer. It increments
	// r.line_count, drops a trailing '\r' at EOF, and rewrites a trailing
	// "\r\n" into "\n".
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		line, err := bufio.reader_read_slice(&r.r, '\n')
		if err == .Buffer_Full {
			// Line is longer than the bufio buffer: collect the pieces
			clear(&r.raw_buffer)
			append(&r.raw_buffer, ..line)
			for err == .Buffer_Full {
				line, err = bufio.reader_read_slice(&r.r, '\n')
				append(&r.raw_buffer, ..line)
			}
			line = r.raw_buffer[:]
		}
		if len(line) > 0 && err == .EOF {
			// A non-empty final line without '\n' still counts as a line
			err = nil
			if line[len(line)-1] == '\r' {
				line = line[:len(line)-1]
			}
		}
		r.line_count += 1
		// normalize \r\n to \n
		// (only the trailing sequence can match: after one rewrite the slice
		// shrinks, so the loop body runs at most once)
		n := len(line)
		for n >= 2 && string(line[n-2:]) == "\r\n" {
			line[n-2] = '\n'
			line = line[:n-1]
		}
		return line, err
	}
	// length_newline returns 1 if b ends with '\n', and 0 otherwise
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1
		}
		return 0
	}
	// next_rune decodes the first rune of b (RUNE_ERROR for empty/invalid input)
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b)
		return r
	}
	// Validate the configured delimiter and comment runes before parsing
	if r.comma == r.comment ||
	   !is_valid_delim(r.comma) ||
	   (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Reader_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		}
		return nil, err
	}
	// Skip comment lines and blank lines until the first line of the record
	line, full_line: []byte
	err_read: io.Error
	for err_read == nil {
		line, err_read = read_line(r)
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil
			continue // comment line: ignore
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil
			continue // blank line: ignore
		}
		full_line = line
		break
	}
	if is_io_error(err_read, .EOF) {
		return nil, err_read
	}
	err: Error
	quote_len :: len(`"`)
	comma_len := utf8.rune_size(r.comma)
	record_line := r.line_count // first line of this record, for error reporting
	clear(&r.record_buffer)
	clear(&r.field_indices)
	// One iteration per field; 'line' is consumed from the front as we go
	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line)
		}
		if len(line) == 0 || line[0] != '"' {
			// Unquoted field: runs to the next delimiter or the end of the line
			i := bytes.index_rune(line, r.comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field) - length_newline(field)]
			}
			if !r.lazy_quotes {
				// A '"' inside an unquoted field is an error unless lazy_quotes is set
				if j := bytes.index_byte(field, '"'); j >= 0 {
					// Column is a rune count into the original full line
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])])
					err = Reader_Error{
						kind = .Bare_Quote,
						start_line = record_line,
						line = r.line_count,
						column = column,
					}
					break parse_field
				}
			}
			append(&r.record_buffer, ..field)
			append(&r.field_indices, len(r.record_buffer))
			if i >= 0 {
				line = line[i+comma_len:]
				continue parse_field
			}
			break parse_field
		} else {
			// Quoted field: consume up to the closing quote, handling escaped
			// quotes ("") and fields that span multiple lines
			line = line[quote_len:]
			for {
				i := bytes.index_byte(line, '"')
				switch {
				case i >= 0:
					// Found a quote: copy the text before it, then decide what
					// the quote means from the rune that follows it
					append(&r.record_buffer, ..line[:i])
					line = line[i+quote_len:]
					switch ch := next_rune(line); {
					case ch == '"': // append quote
						append(&r.record_buffer, '"')
						line = line[quote_len:]
					case ch == r.comma: // end of field
						line = line[comma_len:]
						append(&r.field_indices, len(r.record_buffer))
						continue parse_field
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer))
						break parse_field
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"')
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len])
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						}
						break parse_field
					}
				case len(line) > 0:
					// No closing quote on this line: the field continues on the next line
					append(&r.record_buffer, ..line)
					if err_read != nil {
						break parse_field
					}
					line, err_read = read_line(r)
					if is_io_error(err_read, .EOF) {
						err_read = nil
					}
					full_line = line
				case:
					// Input ended inside a quoted field
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line)
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						}
						break parse_field
					}
					append(&r.field_indices, len(r.record_buffer))
					break parse_field
				}
			}
		}
	}
	// A parse error takes precedence over a pending read error
	if err == nil && err_read != nil {
		err = err_read
	}
	context.allocator = allocator
	dst := dst
	str := string(r.record_buffer[:])
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){}
	}
	clear(dst)
	resize(dst, len(r.field_indices))
	// Slice the accumulated buffer into the individual fields
	pre_idx: int
	for idx, i in r.field_indices {
		field := str[pre_idx:idx]
		if !r.reuse_record_buffer {
			// Clone so the field outlives the shared record buffer
			field = strings.clone(field)
		}
		dst[i] = field
		pre_idx = idx
	}
	// Enforce — or, on the first record, learn — the expected field count
	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Reader_Error{
				kind = .Field_Count,
				start_line = record_line,
				line = r.line_count,
				expected = r.fields_per_record,
				got = len(dst),
			}
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst)
	}
	return dst[:], err
}
|