// reader.odin
  1. // package csv reads and writes comma-separated values (CSV) files.
  2. // This package supports the format described in [[ RFC 4180; https://tools.ietf.org/html/rfc4180.html ]]
  3. package encoding_csv
  4. import "core:bufio"
  5. import "core:bytes"
  6. import "core:io"
  7. import "core:strings"
  8. import "core:unicode/utf8"
// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect their input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter.
	// reader_init will set it to be ','.
	// A "comma" must be a valid rune and must not be \r, \n, or the Unicode replacement character (0xfffd).
	comma: rune,

	// comment, if not 0, is the comment character.
	// Lines beginning with the comment character without a preceding whitespace are ignored.
	comment: rune,

	// fields_per_record is the number of expected fields per record.
	// if fields_per_record is >0, 'read' requires each record to have that field count.
	// if fields_per_record is 0, 'read' sets it to the field count in the first record.
	// if fields_per_record is <0, no check is made and records may have a variable field count.
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored.
	// This is done even if the field delimiter (comma), is whitespace.
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	lazy_quotes: bool,

	// multiline_fields, when set to true, will treat a field starting with a " as a multiline string;
	// therefore, instead of reading until the next \n, it'll read until the next ".
	multiline_fields: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance.
	// By default, each call to 'read' returns a newly allocated slice.
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or use
	// the data stored in record buffer for performance.
	// By default, each call to 'read' clones the strings of each field.
	reuse_record_buffer: bool,

	// internal buffers
	r: bufio.Reader,              // buffered reader over the underlying stream
	line_count: int,              // current line being read in the CSV file
	raw_buffer: [dynamic]byte,    // scratch space for overlong or multiline lines
	record_buffer: [dynamic]byte, // concatenated bytes of all fields of the current record
	field_indices: [dynamic]int,  // end offset of each field within record_buffer
	last_record: [dynamic]string, // backing storage reused when reuse_record is set
	sr: strings.Reader,           // used by reader_init_with_string

	// Set and used by the iterator. Query using `iterator_last_error`.
	last_iterator_error: Error,
}
// Reader_Error_Kind classifies the parse errors a Reader can produce.
Reader_Error_Kind :: enum {
	Bare_Quote,    // a '"' appeared inside a non-quoted field
	Quote,         // an extra or missing '"' in a quoted field
	Field_Count,   // record's field count differs from fields_per_record
	Invalid_Delim, // configured comma/comment rune is not a valid delimiter
}
// reader_error_kind_string maps each Reader_Error_Kind to a human-readable message.
reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote    = "bare \" in non-quoted field",
	.Quote         = "extra or missing \" in quoted field",
	.Field_Count   = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
}
// Reader_Error describes a CSV parse failure and where it occurred.
Reader_Error :: struct {
	kind: Reader_Error_Kind, // classification of the failure
	start_line: int,         // line on which the offending record began
	line: int,               // line on which the error was detected
	column: int,             // rune-indexed column of the error within the line
	expected, got: int, // used by .Field_Count
}
// Error is the union of all errors the CSV reader can return:
// parse errors (Reader_Error) and underlying stream errors (io.Error).
Error :: union {
	Reader_Error,
	io.Error,
}

// DEFAULT_RECORD_BUFFER_CAPACITY is the initial capacity reader_init
// reserves for a Reader's record_buffer.
DEFAULT_RECORD_BUFFER_CAPACITY :: 256
  76. // reader_init initializes a new Reader from r
  77. reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
  78. switch reader.comma {
  79. case '\x00', '\n', '\r', 0xfffd:
  80. reader.comma = ','
  81. }
  82. context.allocator = buffer_allocator
  83. reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY)
  84. reserve(&reader.raw_buffer, 0)
  85. reserve(&reader.field_indices, 0)
  86. reserve(&reader.last_record, 0)
  87. bufio.reader_init(&reader.r, r)
  88. }
  89. // reader_init_with_string initializes a new Reader from s
  90. reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
  91. strings.reader_init(&reader.sr, s)
  92. r, _ := io.to_reader(strings.reader_to_stream(&reader.sr))
  93. reader_init(reader, r, buffer_allocator)
  94. }
  95. // reader_destroy destroys a Reader
  96. reader_destroy :: proc(r: ^Reader) {
  97. delete(r.raw_buffer)
  98. delete(r.record_buffer)
  99. delete(r.field_indices)
  100. delete(r.last_record)
  101. bufio.reader_destroy(&r.r)
  102. }
/*
	Returns a record at a time.

	for record, row_idx in csv.iterator_next(&r) { ... }

	TIP: If you process the results within the loop and don't need to own the results,
	you can set the Reader's `reuse_record` and `reuse_record_buffer` to true;
	you won't need to delete the record or its fields.
*/
iterator_next :: proc(r: ^Reader) -> (record: []string, idx: int, err: Error, more: bool) {
	record, r.last_iterator_error = read(r)
	// line_count has already advanced past the record just read, hence -1.
	// `more` is false once any error (including EOF) has been recorded.
	return record, r.line_count - 1, r.last_iterator_error, r.last_iterator_error == nil
}
// Get last CSV parse error if we ignored it in the iterator loop, e.g.:
//
//	for record, row_idx in csv.iterator_next(&r) { ... }
//	err := csv.iterator_last_error(r)
iterator_last_error :: proc(r: Reader) -> (err: Error) {
	return r.last_iterator_error
}
// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including multi-line fields
@(require_results)
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		// Parse directly into r.last_record; the returned slice aliases it,
		// so the resize/copy below is effectively a self-copy that keeps
		// last_record's length in sync with the record just read.
		record, err = _read_record(r, &r.last_record, allocator)
		resize(&r.last_record, len(record))
		copy(r.last_record[:], record)
	} else {
		// Allocate a fresh slice for the caller on every call.
		record, err = _read_record(r, nil, allocator)
	}
	return
}
  134. // is_io_error checks where an Error is a specific io.Error kind
  135. @(require_results)
  136. is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
  137. if v, ok := err.(io.Error); ok {
  138. return v == io_err
  139. }
  140. return false
  141. }
// read_all reads all the remaining records from r.
// Each record is a slice of fields.
// read_all is defined to read until an EOF, and does not treat EOF as an error
@(require_results)
read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
	context.allocator = allocator
	records: [dynamic][]string
	for {
		record, rerr := _read_record(r, nil, allocator)
		if is_io_error(rerr, .EOF) {
			// A clean EOF simply terminates the loop; it is not reported.
			return records[:], nil
		}
		if rerr != nil {
			// allow for a partial read
			if record != nil {
				append(&records, record)
			}
			return records[:], rerr
		}
		append(&records, record)
	}
}
  164. // read reads a single record (a slice of fields) from the provided input.
  165. @(require_results)
  166. read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
  167. ir: strings.Reader
  168. strings.reader_init(&ir, input)
  169. input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))
  170. r: Reader
  171. reader_init(&r, input_reader, buffer_allocator)
  172. defer reader_destroy(&r)
  173. record, err = read(&r, record_allocator)
  174. n = int(r.r.r)
  175. return
  176. }
  177. // read_all reads all the remaining records from the provided input.
  178. @(require_results)
  179. read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
  180. ir: strings.Reader
  181. strings.reader_init(&ir, input)
  182. input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))
  183. r: Reader
  184. reader_init(&r, input_reader, buffer_allocator)
  185. defer reader_destroy(&r)
  186. return read_all(&r, records_allocator)
  187. }
  188. @(private, require_results)
  189. is_valid_delim :: proc(r: rune) -> bool {
  190. switch r {
  191. case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
  192. return false
  193. }
  194. return utf8.valid_rune(r)
  195. }
// _read_record reads and parses one CSV record.
// Field bytes are accumulated contiguously in r.record_buffer; r.field_indices
// records the end offset of each field. The fields are then sliced out into dst
// (or a freshly allocated array when dst is nil) and returned.
@(private, require_results)
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	// read_line reads one "line" of input. With multiline_fields set, a
	// "line" may span several physical lines so that quoted fields
	// containing newlines stay together.
	@(require_results)
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		if !r.multiline_fields {
			line, err := bufio.reader_read_slice(&r.r, '\n')
			if err == .Buffer_Full {
				// Line exceeds the bufio buffer; accumulate the pieces in raw_buffer.
				clear(&r.raw_buffer)
				append(&r.raw_buffer, ..line)
				for err == .Buffer_Full {
					line, err = bufio.reader_read_slice(&r.r, '\n')
					append(&r.raw_buffer, ..line)
				}
				line = r.raw_buffer[:]
			}

			if len(line) > 0 && err == .EOF {
				// A final line without a trailing newline is still a valid line.
				err = nil
				if line[len(line)-1] == '\r' {
					line = line[:len(line)-1]
				}
			}
			r.line_count += 1

			// normalize \r\n to \n
			n := len(line)
			for n >= 2 && string(line[n-2:]) == "\r\n" {
				line[n-2] = '\n'
				line = line[:n-1]
			}

			return line, err
		} else {
			// Reading a "line" that can possibly contain multiline fields.
			// Unfortunately, this means we need to read a character at a time.
			err: io.Error
			cur: rune
			is_quoted: bool
			field_length := 0

			clear(&r.raw_buffer)

			read_loop: for err == .None {
				cur, _, err = bufio.reader_read_rune(&r.r)
				if err != .None { break read_loop }

				switch cur {
				case '"':
					// A quote only opens a quoted field when it is the field's first rune.
					is_quoted = field_length == 0
					field_length += 1
				case '\n', '\r':
					// A newline terminates the line only outside a quoted field.
					is_quoted or_break read_loop
				case r.comma:
					field_length = 0
				case:
					field_length += 1
				}
				rune_buf, rune_len := utf8.encode_rune(cur)
				append(&r.raw_buffer, ..rune_buf[:rune_len])
			}
			return r.raw_buffer[:], err
		}
		unreachable()
	}

	// length_newline returns 1 when b ends in '\n', otherwise 0.
	@(require_results)
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1
		}
		return 0
	}

	// next_rune decodes the first rune of b (RUNE_ERROR when b is empty or invalid).
	@(require_results)
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b)
		return r
	}

	// Validate the configured delimiter/comment runes before parsing anything.
	if r.comma == r.comment ||
	   !is_valid_delim(r.comma) ||
	   (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Reader_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		}
		return nil, err
	}

	// Skip comment lines and blank lines until a record line (or an error) is found.
	line, full_line: []byte
	err_read: io.Error
	for err_read == nil {
		line, err_read = read_line(r)
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil
			continue // comment line
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil
			continue // blank line
		}
		full_line = line
		break
	}
	if is_io_error(err_read, .EOF) {
		return nil, err_read
	}

	err: Error

	quote_len :: len(`"`)
	comma_len := utf8.rune_size(r.comma)
	record_line := r.line_count
	clear(&r.record_buffer)
	clear(&r.field_indices)

	// Parse one field per iteration; `line` shrinks as fields are consumed.
	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line)
		}

		if len(line) == 0 || line[0] != '"' {
			// Non-quoted field: extends to the next delimiter or end of line.
			i := bytes.index_rune(line, r.comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field) - length_newline(field)]
			}
			if !r.lazy_quotes {
				// A '"' inside a non-quoted field is an error unless lazy_quotes is set.
				if j := bytes.index_byte(field, '"'); j >= 0 {
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])])
					err = Reader_Error{
						kind = .Bare_Quote,
						start_line = record_line,
						line = r.line_count,
						column = column,
					}
					break parse_field
				}
			}
			append(&r.record_buffer, ..field)
			append(&r.field_indices, len(r.record_buffer))
			if i >= 0 {
				line = line[i+comma_len:]
				continue parse_field
			}
			break parse_field
		} else {
			// Quoted field: consume the opening quote, then scan for the closing one.
			line = line[quote_len:]
			for {
				i := bytes.index_byte(line, '"')
				switch {
				case i >= 0:
					// Found a quote; what follows it decides its meaning.
					append(&r.record_buffer, ..line[:i])
					line = line[i+quote_len:]
					switch ch := next_rune(line); {
					case ch == '"': // append quote
						append(&r.record_buffer, '"')
						line = line[quote_len:]
					case ch == r.comma: // end of field
						line = line[comma_len:]
						append(&r.field_indices, len(r.record_buffer))
						continue parse_field
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer))
						break parse_field
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"')
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len])
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						}
						break parse_field
					}
				case len(line) > 0:
					// No closing quote on this line: the field continues on the next one.
					append(&r.record_buffer, ..line)
					if err_read != nil {
						break parse_field
					}
					line, err_read = read_line(r)
					if is_io_error(err_read, .EOF) {
						err_read = nil
					}
					full_line = line
				case:
					// Input ran out inside a quoted field.
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line)
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						}
						break parse_field
					}
					append(&r.field_indices, len(r.record_buffer))
					break parse_field
				}
			}
		}
	}

	// A read error only surfaces if no parse error was recorded first.
	if err == nil && err_read != nil {
		err = err_read
	}

	context.allocator = allocator
	dst := dst
	str := string(r.record_buffer[:])
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){}
	}
	clear(dst)
	resize(dst, len(r.field_indices))
	pre_idx: int
	for idx, i in r.field_indices {
		// Slice the shared record buffer into fields; clone unless the caller
		// opted into reusing the buffer's memory.
		field := str[pre_idx:idx]
		if !r.reuse_record_buffer {
			field = strings.clone(field)
		}
		dst[i] = field
		pre_idx = idx
	}

	// Enforce — or, on the first record, learn — the expected field count.
	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Reader_Error{
				kind = .Field_Count,
				start_line = record_line,
				line = r.line_count,
				expected = r.fields_per_record,
				got = len(dst),
			}
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst)
	}
	return dst[:], err
}