Browse Source

Merge pull request #1785 from Kelimion/csv

Allow CSV/TSV reader to read multi-line fields.
Jeroen van Rijn 3 years ago
parent
commit
f50399e394
3 changed files with 67 additions and 21 deletions
  1. 2 0
      .gitignore
  2. 65 21
      core/encoding/csv/reader.odin
  3. BIN
      demo.bin

+ 2 - 0
.gitignore

@@ -269,6 +269,8 @@ bin/
 # - Linux/MacOS
 odin
 odin.dSYM
+*.bin
+demo.bin
 
 # shared collection
 shared/

+ 65 - 21
core/encoding/csv/reader.odin

@@ -34,6 +34,10 @@ Reader :: struct {
 	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
 	lazy_quotes: bool,
 
+	// multiline_fields, when set to true, will treat a field starting with a " as a multiline string
+	// therefore, instead of reading until the next \n, it'll read until the next "
+	multiline_fields: bool,
+
 	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
 	// for performance
 	// By default, each call to 'read' returns a newly allocated slice
@@ -194,32 +198,72 @@ is_valid_delim :: proc(r: rune) -> bool {
 @private
 _read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
 	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
-		line, err := bufio.reader_read_slice(&r.r, '\n')
-		if err == .Buffer_Full {
-			clear(&r.raw_buffer)
-			append(&r.raw_buffer, ..line)
-			for err == .Buffer_Full {
-				line, err = bufio.reader_read_slice(&r.r, '\n')
+		if !r.multiline_fields {
+			line, err := bufio.reader_read_slice(&r.r, '\n')
+			if err == .Buffer_Full {
+				clear(&r.raw_buffer)
 				append(&r.raw_buffer, ..line)
+				for err == .Buffer_Full {
+					line, err = bufio.reader_read_slice(&r.r, '\n')
+					append(&r.raw_buffer, ..line)
+				}
+				line = r.raw_buffer[:]
 			}
-			line = r.raw_buffer[:]
-		}
-		if len(line) > 0 && err == .EOF {
-			err = nil
-			if line[len(line)-1] == '\r' {
-				line = line[:len(line)-1]
+			if len(line) > 0 && err == .EOF {
+				err = nil
+				if line[len(line)-1] == '\r' {
+					line = line[:len(line)-1]
+				}
 			}
-		}
-		r.line_count += 1
+			r.line_count += 1
 
-		// normalize \r\n to \n
-		n := len(line)
-		for n >= 2 && string(line[n-2:]) == "\r\n" {
-			line[n-2] = '\n'
-			line = line[:n-1]
-		}
+			// normalize \r\n to \n
+			n := len(line)
+			for n >= 2 && string(line[n-2:]) == "\r\n" {
+				line[n-2] = '\n'
+				line = line[:n-1]
+			}
+			return line, err
+
+		} else {
+			// Reading a "line" that can possibly contain multiline fields.
+			// Unfortunately, this means we need to read a character at a time.
 
-		return line, err
+			err:       io.Error
+			cur:       rune
+			is_quoted: bool
+
+			field_length := 0
+
+			clear(&r.raw_buffer)
+
+			read_loop: for err == .None {
+				cur, _, err = bufio.reader_read_rune(&r.r)
+
+				if err != .None { break read_loop }
+
+				switch cur {
+				case '"':
+					is_quoted = field_length == 0
+					field_length += 1
+
+				case '\n', '\r':
+					if !is_quoted { break read_loop }
+
+				case r.comma:
+					field_length = 0
+
+				case:
+					field_length += 1
+				}
+
+				rune_buf, rune_len := utf8.encode_rune(cur)
+				append(&r.raw_buffer, ..rune_buf[:rune_len])
+			}
+
+			return r.raw_buffer[:], err
+		}
+		unreachable()
 	}
 
 	length_newline :: proc(b: []byte) -> int {

BIN
demo.bin