// package csv reads and writes comma-separated values (CSV) files.
// This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
package encoding_csv

import "core:bufio"
import "core:bytes"
import "core:io"
import "core:strings"
import "core:unicode/utf8"
// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect the input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter.
	// reader_init will set it to be ','.
	// A "comma" must be a valid rune, and must not be \r, \n, or the Unicode replacement character (0xfffd).
	comma: rune,

	// comment, if not 0, is the comment character.
	// Lines beginning with the comment character without a preceding whitespace are ignored.
	comment: rune,

	// fields_per_record is the number of expected fields per record.
	// if fields_per_record is >0, 'read' requires each record to have that field count.
	// if fields_per_record is 0, 'read' sets it to the field count in the first record.
	// if fields_per_record is <0, no check is made and records may have a variable field count.
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored.
	// This is done even if the field delimiter (comma), is whitespace.
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	lazy_quotes: bool,

	// multiline_fields, when set to true, will treat a field starting with a " as a multiline string;
	// therefore, instead of reading until the next \n, it'll read until the next ".
	multiline_fields: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance.
	// By default, each call to 'read' returns a newly allocated slice.
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or use
	// the data stored in the record buffer for performance.
	// By default, each call to 'read' clones the strings of each field.
	reuse_record_buffer: bool,

	// internal buffers
	r:             bufio.Reader,
	line_count:    int,              // current line being read in the CSV file
	raw_buffer:    [dynamic]byte,    // scratch space for long or multiline physical lines
	record_buffer: [dynamic]byte,    // concatenated bytes of every field in the current record
	field_indices: [dynamic]int,     // end offset of each field within record_buffer
	last_record:   [dynamic]string,  // backing storage for records when reuse_record is set

	sr: strings.Reader, // used by reader_init_with_string

	// Set and used by the iterator. Query using `iterator_last_error`
	last_iterator_error: Error,
}
// Reader_Error_Kind enumerates the parse failures that `read` can report.
Reader_Error_Kind :: enum {
	Bare_Quote,    // a '"' appeared inside a non-quoted field
	Quote,         // extra or missing '"' in a quoted field
	Field_Count,   // a record's field count differs from fields_per_record
	Invalid_Delim, // the configured comma/comment runes are invalid or conflict
}
// reader_error_kind_string maps each Reader_Error_Kind to a human-readable message.
reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote    = "bare \" in non-quoted field",
	.Quote         = "extra or missing \" in quoted field",
	.Field_Count   = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
}
// Reader_Error describes a parse failure, locating it by line and column.
Reader_Error :: struct {
	kind:       Reader_Error_Kind,
	start_line: int, // line on which the offending record started
	line:       int, // line on which the error itself occurred
	column:     int, // rune-indexed column of the error (not a byte offset)
	expected, got: int, // used by .Field_Count
}
// Error is either a CSV parse error or an error from the underlying stream.
Error :: union {
	Reader_Error,
	io.Error,
}

// Initial capacity reserved for Reader.record_buffer by reader_init.
DEFAULT_RECORD_BUFFER_CAPACITY :: 256
  76. // reader_init initializes a new Reader from r
  77. reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
  78. switch reader.comma {
  79. case '\x00', '\n', '\r', 0xfffd:
  80. reader.comma = ','
  81. }
  82. context.allocator = buffer_allocator
  83. reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY)
  84. reserve(&reader.raw_buffer, 0)
  85. reserve(&reader.field_indices, 0)
  86. reserve(&reader.last_record, 0)
  87. bufio.reader_init(&reader.r, r)
  88. }
  89. // reader_init_with_string initializes a new Reader from s
  90. reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
  91. strings.reader_init(&reader.sr, s)
  92. r, _ := io.to_reader(strings.reader_to_stream(&reader.sr))
  93. reader_init(reader, r, buffer_allocator)
  94. }
  95. // reader_destroy destroys a Reader
  96. reader_destroy :: proc(r: ^Reader) {
  97. delete(r.raw_buffer)
  98. delete(r.record_buffer)
  99. delete(r.field_indices)
  100. delete(r.last_record)
  101. bufio.reader_destroy(&r.r)
  102. }
/*
	Returns a record at a time.

	Usage:
		for record, row_idx in csv.iterator_next(&r) { ... }

	TIP: If you process the results within the loop and don't need to own the results,
	you can set the Reader's `reuse_record` and `reuse_record_buffer` to true;
	you won't need to delete the record or its fields.
*/
iterator_next :: proc(r: ^Reader) -> (record: []string, idx: int, err: Error, more: bool) {
	// `more` becomes false once any error (including EOF) is seen; the error
	// itself is retained on the Reader for `iterator_last_error` to query.
	record, r.last_iterator_error = read(r)
	return record, r.line_count - 1, r.last_iterator_error, r.last_iterator_error == nil
}
// iterator_last_error returns the error (if any) recorded by the most
// recent call to `iterator_next`.
iterator_last_error :: proc(r: Reader) -> (err: Error) {
	return r.last_iterator_error
}
// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including multi-line fields
@(require_results)
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		// Parse into r.last_record so the returned slice shares its backing
		// memory, then resize/copy to keep last_record in sync with the
		// record actually produced. Order matters: `record` may alias
		// `r.last_record`'s storage.
		record, err = _read_record(r, &r.last_record, allocator)
		resize(&r.last_record, len(record))
		copy(r.last_record[:], record)
	} else {
		// Allocate a fresh slice for the caller.
		record, err = _read_record(r, nil, allocator)
	}
	return
}
  132. // is_io_error checks where an Error is a specific io.Error kind
  133. @(require_results)
  134. is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
  135. if v, ok := err.(io.Error); ok {
  136. return v == io_err
  137. }
  138. return false
  139. }
  140. // read_all reads all the remaining records from r.
  141. // Each record is a slice of fields.
  142. // read_all is defined to read until an EOF, and does not treat, and does not treat EOF as an error
  143. @(require_results)
  144. read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
  145. context.allocator = allocator
  146. records: [dynamic][]string
  147. for {
  148. record, rerr := _read_record(r, nil, allocator)
  149. if is_io_error(rerr, .EOF) {
  150. return records[:], nil
  151. }
  152. if rerr != nil {
  153. // allow for a partial read
  154. if record != nil {
  155. append(&records, record)
  156. }
  157. return records[:], rerr
  158. }
  159. append(&records, record)
  160. }
  161. }
// read_from_string reads a single record (a slice of fields) from the provided input.
// `n` reports how many bytes of `input` were consumed.
@(require_results)
read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
	ir: strings.Reader
	strings.reader_init(&ir, input)
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))
	r: Reader
	reader_init(&r, input_reader, buffer_allocator)
	defer reader_destroy(&r)
	record, err = read(&r, record_allocator)
	// NOTE(review): assumes bufio.Reader's `r` field is the current read
	// offset, i.e. bytes consumed from `input` — confirm against core:bufio.
	n = int(r.r.r)
	return
}
  175. // read_all reads all the remaining records from the provided input.
  176. @(require_results)
  177. read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
  178. ir: strings.Reader
  179. strings.reader_init(&ir, input)
  180. input_reader, _ := io.to_reader(strings.reader_to_stream(&ir))
  181. r: Reader
  182. reader_init(&r, input_reader, buffer_allocator)
  183. defer reader_destroy(&r)
  184. return read_all(&r, records_allocator)
  185. }
  186. @(private, require_results)
  187. is_valid_delim :: proc(r: rune) -> bool {
  188. switch r {
  189. case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
  190. return false
  191. }
  192. return utf8.valid_rune(r)
  193. }
// _read_record parses and returns the next record from r.
// If dst is non-nil, field strings are stored into it (reused across calls);
// otherwise a fresh slice is allocated from `allocator`. Fields are
// accumulated byte-wise in r.record_buffer, with r.field_indices recording
// the end offset of each field.
@(private, require_results)
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	// read_line returns the next "line" of input. Without multiline_fields a
	// line ends at the next '\n'; with it, a quoted field may carry the line
	// across several physical newlines.
	@(require_results)
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		if !r.multiline_fields {
			line, err := bufio.reader_read_slice(&r.r, '\n')
			if err == .Buffer_Full {
				// Line exceeds the bufio buffer: accumulate the pieces in raw_buffer.
				clear(&r.raw_buffer)
				append(&r.raw_buffer, ..line)
				for err == .Buffer_Full {
					line, err = bufio.reader_read_slice(&r.r, '\n')
					append(&r.raw_buffer, ..line)
				}
				line = r.raw_buffer[:]
			}
			if len(line) > 0 && err == .EOF {
				// A non-empty final line without a trailing '\n' is not an
				// error; also drop a dangling '\r'.
				err = nil
				if line[len(line)-1] == '\r' {
					line = line[:len(line)-1]
				}
			}
			r.line_count += 1
			// normalize \r\n to \n
			n := len(line)
			for n >= 2 && string(line[n-2:]) == "\r\n" {
				line[n-2] = '\n'
				line = line[:n-1]
			}
			return line, err
		} else {
			// Reading a "line" that can possibly contain multiline fields.
			// Unfortunately, this means we need to read a character at a time.
			err: io.Error
			cur: rune
			is_quoted: bool
			field_length := 0
			clear(&r.raw_buffer)
			read_loop: for err == .None {
				cur, _, err = bufio.reader_read_rune(&r.r)
				if err != .None { break read_loop }
				switch cur {
				case '"':
					// A quote at the very start of a field opens a quoted field.
					is_quoted = field_length == 0
					field_length += 1
				case '\n', '\r':
					// A newline only terminates the line outside a quoted field
					// (or_break fires when is_quoted is false).
					is_quoted or_break read_loop
				case r.comma:
					field_length = 0
				case:
					field_length += 1
				}
				rune_buf, rune_len := utf8.encode_rune(cur)
				append(&r.raw_buffer, ..rune_buf[:rune_len])
			}
			return r.raw_buffer[:], err
		}
		unreachable()
	}

	// length_newline returns 1 if b ends in '\n', otherwise 0.
	@(require_results)
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1
		}
		return 0
	}

	// next_rune decodes the first rune of b (RUNE_ERROR if empty or invalid).
	@(require_results)
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b)
		return r
	}

	// Validate the configured delimiter and comment runes before parsing.
	if r.comma == r.comment ||
	   !is_valid_delim(r.comma) ||
	   (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Reader_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		}
		return nil, err
	}

	// Skip comment lines and blank lines until a record line is found.
	line, full_line: []byte
	err_read: io.Error
	for err_read == nil {
		line, err_read = read_line(r)
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil
			continue // comment line
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil
			continue // empty line
		}
		full_line = line
		break
	}
	if is_io_error(err_read, .EOF) {
		return nil, err_read
	}

	err: Error
	quote_len :: len(`"`)
	comma_len := utf8.rune_size(r.comma)
	record_line := r.line_count
	clear(&r.record_buffer)
	clear(&r.field_indices)
	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line)
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted field: it runs until the next comma or end of line.
			i := bytes.index_rune(line, r.comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field) - length_newline(field)]
			}
			if !r.lazy_quotes {
				// A '"' inside a non-quoted field is an error unless lazy_quotes is set.
				if j := bytes.index_byte(field, '"'); j >= 0 {
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])])
					err = Reader_Error{
						kind = .Bare_Quote,
						start_line = record_line,
						line = r.line_count,
						column = column,
					}
					break parse_field
				}
			}
			append(&r.record_buffer, ..field)
			append(&r.field_indices, len(r.record_buffer))
			if i >= 0 {
				line = line[i+comma_len:]
				continue parse_field
			}
			break parse_field
		} else {
			// Quoted field: consume the opening quote, then scan for the
			// closing quote, handling "" escapes and embedded newlines.
			line = line[quote_len:]
			for {
				i := bytes.index_byte(line, '"')
				switch {
				case i >= 0:
					// Found a quote: copy everything before it, then decide
					// what it means from the rune that follows it.
					append(&r.record_buffer, ..line[:i])
					line = line[i+quote_len:]
					switch ch := next_rune(line); {
					case ch == '"': // append quote
						append(&r.record_buffer, '"')
						line = line[quote_len:]
					case ch == r.comma: // end of field
						line = line[comma_len:]
						append(&r.field_indices, len(r.record_buffer))
						continue parse_field
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer))
						break parse_field
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"')
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len])
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						}
						break parse_field
					}
				case len(line) > 0:
					// No closing quote on this line: the quoted field
					// continues on the next line.
					append(&r.record_buffer, ..line)
					if err_read != nil {
						break parse_field
					}
					line, err_read = read_line(r)
					if is_io_error(err_read, .EOF) {
						err_read = nil
					}
					full_line = line
				case:
					// Input ended inside a quoted field.
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line)
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						}
						break parse_field
					}
					append(&r.field_indices, len(r.record_buffer))
					break parse_field
				}
			}
		}
	}
	if err == nil && err_read != nil {
		err = err_read
	}

	context.allocator = allocator
	dst := dst
	str := string(r.record_buffer[:])
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){}
	}
	clear(dst)
	resize(dst, len(r.field_indices))
	// Slice record_buffer into the individual field strings using the
	// recorded end offsets; clone unless the caller opted into buffer reuse.
	pre_idx: int
	for idx, i in r.field_indices {
		field := str[pre_idx:idx]
		if !r.reuse_record_buffer {
			field = strings.clone(field)
		}
		dst[i] = field
		pre_idx = idx
	}
	// Enforce (or learn, when 0) the expected field count.
	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Reader_Error{
				kind = .Field_Count,
				start_line = record_line,
				line = r.line_count,
				expected = r.fields_per_record,
				got = len(dst),
			}
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst)
	}
	return dst[:], err
}
  421. }