reader.odin 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. // package csv reads and writes comma-separated values (CSV) files.
  2. // This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
  3. package csv
  4. import "core:bufio"
  5. import "core:bytes"
  6. import "core:io"
  7. import "core:strings"
  8. import "core:unicode/utf8"
// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect their input to conform to RFC 4180.
// Configure the public fields after calling reader_init (which sets `comma`) and
// before the first call to `read`.
Reader :: struct {
	// comma is the field delimiter
	// reader_init will set it to be ','
	// A "comma" must be a valid rune and must not be 0, '"', \r, \n, or the
	// Unicode replacement character (0xfffd). It must also differ from `comment`.
	comma: rune,

	// comment, if not 0, is the comment character
	// Lines beginning with the comment character without a preceding whitespace are ignored
	// When set, it is subject to the same restrictions as `comma`.
	comment: rune,

	// fields_per_record is the number of expected fields per record
	//         if fields_per_record is >0, 'read' requires each record to have that field count
	//         if fields_per_record is  0, 'read' sets it to the field count in the first record
	//         if fields_per_record is <0, no check is made and records may have a variable field count
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored
	// This is done even if the field delimiter (comma), is whitespace
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
	lazy_quotes: bool,

	// multiline_fields, when set to true, will treat a field starting with a " as a multiline string
	// therefore, instead of reading until the next \n, it'll read until the next "
	// In this mode the input is consumed one rune at a time.
	multiline_fields: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance (the slice is `last_record` and is overwritten by the next 'read')
	// By default, each call to 'read' returns a newly allocated slice
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or uses
	// the data stored in record buffer for performance
	// When set, the returned strings alias `record_buffer`, which is cleared at the start of
	// the next record.
	// By default, each call to 'read' clones the strings of each field
	reuse_record_buffer: bool,


	// internal buffers
	r:             bufio.Reader,      // buffered view of the input stream
	line_count:    int,               // number of lines read so far by the line-at-a-time reader; reported in Reader_Error
	raw_buffer:    [dynamic]byte,     // scratch space for a line that does not fit in the bufio buffer, or for multiline reads
	record_buffer: [dynamic]byte,     // unescaped bytes of every field of the current record, back to back
	field_indices: [dynamic]int,      // end offset of each field inside record_buffer
	last_record:   [dynamic]string,   // destination slice used when reuse_record is set

	sr: strings.Reader, // used by reader_init_with_string
}
// Reader_Error_Kind enumerates the ways a record can fail to parse.
Reader_Error_Kind :: enum {
	Bare_Quote,    // a " was found inside an unquoted field while lazy_quotes is off
	Quote,         // a quoted field has a stray or missing " while lazy_quotes is off
	Field_Count,   // the record's field count differs from a positive fields_per_record
	Invalid_Delim, // comma/comment are not usable delimiters (see is_valid_delim)
}
// reader_error_kind_string maps each Reader_Error_Kind to a short
// human-readable description suitable for error messages.
reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote     = "bare \" in non-quoted field",
	.Quote          = "extra or missing \" in quoted field",
	.Field_Count    = "wrong field count",
	.Invalid_Delim  = "invalid delimiter",
}
// Reader_Error describes a parse failure and where it happened.
Reader_Error :: struct {
	kind:          Reader_Error_Kind,
	start_line:    int, // value of Reader.line_count when the failing record started
	line:          int, // value of Reader.line_count when the error was detected
	column:        int, // number of runes preceding the offending quote on its line (set for .Bare_Quote and .Quote)
	expected, got: int, // used by .Field_Count
}
// Error is the error type returned by the reading procedures: either a CSV
// parse error (Reader_Error) or an error from the underlying stream (io.Error).
// A nil Error means success.
Error :: union {
	Reader_Error,
	io.Error,
}
// DEFAULT_RECORD_BUFFER_CAPACITY is the initial capacity, in bytes, that
// reader_init reserves for Reader.record_buffer.
DEFAULT_RECORD_BUFFER_CAPACITY :: 256
  74. // reader_init initializes a new Reader from r
  75. reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
  76. reader.comma = ','
  77. context.allocator = buffer_allocator
  78. reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY)
  79. reserve(&reader.raw_buffer, 0)
  80. reserve(&reader.field_indices, 0)
  81. reserve(&reader.last_record, 0)
  82. bufio.reader_init(&reader.r, r)
  83. }
  84. // reader_init_with_string initializes a new Reader from s
  85. reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
  86. strings.reader_init(&reader.sr, s)
  87. r, _ := io.to_reader(strings.reader_to_stream(&reader.sr))
  88. reader_init(reader, r, buffer_allocator)
  89. }
  90. // reader_destroy destroys a Reader
  91. reader_destroy :: proc(r: ^Reader) {
  92. delete(r.raw_buffer)
  93. delete(r.record_buffer)
  94. delete(r.field_indices)
  95. delete(r.last_record)
  96. bufio.reader_destroy(&r.r)
  97. }
  98. // read reads a single record (a slice of fields) from r
  99. //
  100. // All \r\n sequences are normalized to \n, including multi-line field
  101. read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
  102. if r.reuse_record {
  103. record, err = _read_record(r, &r.last_record, allocator)
  104. resize(&r.last_record, len(record))
  105. copy(r.last_record[:], record)
  106. } else {
  107. record, err = _read_record(r, nil, allocator)
  108. }
  109. return
  110. }
  111. // is_io_error checks where an Error is a specific io.Error kind
  112. is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
  113. if v, ok := err.(io.Error); ok {
  114. return v == io_err
  115. }
  116. return false
  117. }
  118. // read_all reads all the remaining records from r.
  119. // Each record is a slice of fields.
  120. // read_all is defined to read until an EOF, and does not treat, and does not treat EOF as an error
  121. read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
  122. context.allocator = allocator
  123. records: [dynamic][]string
  124. for {
  125. record, rerr := _read_record(r, nil, allocator)
  126. if is_io_error(rerr, .EOF) {
  127. return records[:], nil
  128. }
  129. if rerr != nil {
  130. return nil, rerr
  131. }
  132. append(&records, record)
  133. }
  134. }
  135. // read reads a single record (a slice of fields) from the provided input.
  136. read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
  137. ir: strings.Reader
  138. strings.reader_init(&ir, input)
  139. input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))
  140. r: Reader
  141. reader_init(&r, input_reader, buffer_allocator)
  142. defer reader_destroy(&r)
  143. record, err = read(&r, record_allocator)
  144. n = int(r.r.r)
  145. return
  146. }
  147. // read_all reads all the remaining records from the provided input.
  148. read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
  149. ir: strings.Reader
  150. strings.reader_init(&ir, input)
  151. input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))
  152. r: Reader
  153. reader_init(&r, input_reader, buffer_allocator)
  154. defer reader_destroy(&r)
  155. return read_all(&r, records_allocator)
  156. }
  157. @private
  158. is_valid_delim :: proc(r: rune) -> bool {
  159. switch r {
  160. case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
  161. return false
  162. }
  163. return utf8.valid_rune(r)
  164. }
  165. @private
  166. _read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
  167. read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
  168. if !r.multiline_fields {
  169. line, err := bufio.reader_read_slice(&r.r, '\n')
  170. if err == .Buffer_Full {
  171. clear(&r.raw_buffer)
  172. append(&r.raw_buffer, ..line)
  173. for err == .Buffer_Full {
  174. line, err = bufio.reader_read_slice(&r.r, '\n')
  175. append(&r.raw_buffer, ..line)
  176. }
  177. line = r.raw_buffer[:]
  178. }
  179. if len(line) > 0 && err == .EOF {
  180. err = nil
  181. if line[len(line)-1] == '\r' {
  182. line = line[:len(line)-1]
  183. }
  184. }
  185. r.line_count += 1
  186. // normalize \r\n to \n
  187. n := len(line)
  188. for n >= 2 && string(line[n-2:]) == "\r\n" {
  189. line[n-2] = '\n'
  190. line = line[:n-1]
  191. }
  192. return line, err
  193. } else {
  194. // Reading a "line" that can possibly contain multiline fields.
  195. // Unfortunately, this means we need to read a character at a time.
  196. err: io.Error
  197. cur: rune
  198. is_quoted: bool
  199. field_length := 0
  200. clear(&r.raw_buffer)
  201. read_loop: for err == .None {
  202. cur, _, err = bufio.reader_read_rune(&r.r)
  203. if err != .None { break read_loop }
  204. switch cur {
  205. case '"':
  206. is_quoted = field_length == 0
  207. field_length += 1
  208. case '\n', '\r':
  209. is_quoted or_break read_loop
  210. case r.comma:
  211. field_length = 0
  212. case:
  213. field_length += 1
  214. }
  215. rune_buf, rune_len := utf8.encode_rune(cur)
  216. append(&r.raw_buffer, ..rune_buf[:rune_len])
  217. }
  218. return r.raw_buffer[:], err
  219. }
  220. unreachable()
  221. }
  222. length_newline :: proc(b: []byte) -> int {
  223. if len(b) > 0 && b[len(b)-1] == '\n' {
  224. return 1
  225. }
  226. return 0
  227. }
  228. next_rune :: proc(b: []byte) -> rune {
  229. r, _ := utf8.decode_rune(b)
  230. return r
  231. }
  232. if r.comma == r.comment ||
  233. !is_valid_delim(r.comma) ||
  234. (r.comment != 0 && !is_valid_delim(r.comment)) {
  235. err := Reader_Error{
  236. kind = .Invalid_Delim,
  237. line = r.line_count,
  238. }
  239. return nil, err
  240. }
  241. line, full_line: []byte
  242. err_read: io.Error
  243. for err_read == nil {
  244. line, err_read = read_line(r)
  245. if r.comment != 0 && next_rune(line) == r.comment {
  246. line = nil
  247. continue
  248. }
  249. if err_read == nil && len(line) == length_newline(line) {
  250. line = nil
  251. continue
  252. }
  253. full_line = line
  254. break
  255. }
  256. if is_io_error(err_read, .EOF) {
  257. return nil, err_read
  258. }
  259. err: Error
  260. quote_len :: len(`"`)
  261. comma_len := utf8.rune_size(r.comma)
  262. record_line := r.line_count
  263. clear(&r.record_buffer)
  264. clear(&r.field_indices)
  265. parse_field: for {
  266. if r.trim_leading_space {
  267. line = bytes.trim_left_space(line)
  268. }
  269. if len(line) == 0 || line[0] != '"' {
  270. i := bytes.index_rune(line, r.comma)
  271. field := line
  272. if i >= 0 {
  273. field = field[:i]
  274. } else {
  275. field = field[:len(field) - length_newline(field)]
  276. }
  277. if !r.lazy_quotes {
  278. if j := bytes.index_byte(field, '"'); j >= 0 {
  279. column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])])
  280. err = Reader_Error{
  281. kind = .Bare_Quote,
  282. start_line = record_line,
  283. line = r.line_count,
  284. column = column,
  285. }
  286. break parse_field
  287. }
  288. }
  289. append(&r.record_buffer, ..field)
  290. append(&r.field_indices, len(r.record_buffer))
  291. if i >= 0 {
  292. line = line[i+comma_len:]
  293. continue parse_field
  294. }
  295. break parse_field
  296. } else {
  297. line = line[quote_len:]
  298. for {
  299. i := bytes.index_byte(line, '"')
  300. switch {
  301. case i >= 0:
  302. append(&r.record_buffer, ..line[:i])
  303. line = line[i+quote_len:]
  304. switch ch := next_rune(line); {
  305. case ch == '"': // append quote
  306. append(&r.record_buffer, '"')
  307. line = line[quote_len:]
  308. case ch == r.comma: // end of field
  309. line = line[comma_len:]
  310. append(&r.field_indices, len(r.record_buffer))
  311. continue parse_field
  312. case length_newline(line) == len(line): // end of line
  313. append(&r.field_indices, len(r.record_buffer))
  314. break parse_field
  315. case r.lazy_quotes: // bare quote
  316. append(&r.record_buffer, '"')
  317. case: // invalid non-escaped quote
  318. column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len])
  319. err = Reader_Error{
  320. kind = .Quote,
  321. start_line = record_line,
  322. line = r.line_count,
  323. column = column,
  324. }
  325. break parse_field
  326. }
  327. case len(line) > 0:
  328. append(&r.record_buffer, ..line)
  329. if err_read != nil {
  330. break parse_field
  331. }
  332. line, err_read = read_line(r)
  333. if is_io_error(err_read, .EOF) {
  334. err_read = nil
  335. }
  336. full_line = line
  337. case:
  338. if !r.lazy_quotes && err_read == nil {
  339. column := utf8.rune_count(full_line)
  340. err = Reader_Error{
  341. kind = .Quote,
  342. start_line = record_line,
  343. line = r.line_count,
  344. column = column,
  345. }
  346. break parse_field
  347. }
  348. append(&r.field_indices, len(r.record_buffer))
  349. break parse_field
  350. }
  351. }
  352. }
  353. }
  354. if err == nil && err_read != nil {
  355. err = err_read
  356. }
  357. context.allocator = allocator
  358. dst := dst
  359. str := string(r.record_buffer[:])
  360. if dst == nil {
  361. // use local variable
  362. dst = &([dynamic]string){}
  363. }
  364. clear(dst)
  365. resize(dst, len(r.field_indices))
  366. pre_idx: int
  367. for idx, i in r.field_indices {
  368. field := str[pre_idx:idx]
  369. if !r.reuse_record_buffer {
  370. field = strings.clone(field)
  371. }
  372. dst[i] = field
  373. pre_idx = idx
  374. }
  375. if r.fields_per_record > 0 {
  376. if len(dst) != r.fields_per_record && err == nil {
  377. err = Reader_Error{
  378. kind = .Field_Count,
  379. start_line = record_line,
  380. line = r.line_count,
  381. expected = r.fields_per_record,
  382. got = len(dst),
  383. }
  384. }
  385. } else if r.fields_per_record == 0 {
  386. r.fields_per_record = len(dst)
  387. }
  388. return dst[:], err
  389. }