// package csv reads and writes comma-separated values (CSV) files.
// This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
package csv

import "core:bufio"
import "core:bytes"
import "core:io"
import "core:strings"
import "core:unicode/utf8"
// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect its input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter.
	// reader_init will set it to be ','.
	// A "comma" must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xfffd).
	comma: rune,

	// comment, if not 0, is the comment character.
	// Lines beginning with the comment character without a preceding whitespace are ignored.
	comment: rune,

	// fields_per_record is the number of expected fields per record:
	//   if fields_per_record is >0, 'read' requires each record to have that field count
	//   if fields_per_record is  0, 'read' sets it to the field count in the first record
	//   if fields_per_record is <0, no check is made and records may have a variable field count
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored.
	// This is done even if the field delimiter (comma) is whitespace.
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and
	// a non-doubled quote may appear in a quoted field.
	lazy_quotes: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance.
	// By default, each call to 'read' returns a newly allocated slice.
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or use
	// the data stored in the record buffer for performance.
	// By default, each call to 'read' clones the strings of each field.
	reuse_record_buffer: bool,

	// internal buffers
	r:             bufio.Reader,
	line_count:    int,              // current line being read in the CSV file
	raw_buffer:    [dynamic]byte,    // overflow storage when a line exceeds the bufio buffer
	record_buffer: [dynamic]byte,    // unescaped bytes of the current record's fields, back-to-back
	field_indices: [dynamic]int,     // end offset of each field within record_buffer
	last_record:   [dynamic]string,  // result storage reused across calls when reuse_record is set
	sr: strings.Reader, // used by reader_init_with_string
}
// Reader_Error_Kind enumerates the kinds of CSV parse failures a Reader can report.
Reader_Error_Kind :: enum {
	Bare_Quote,    // a '"' appeared inside a non-quoted field (only when !lazy_quotes)
	Quote,         // extra or missing '"' in a quoted field
	Field_Count,   // a record's field count did not match fields_per_record
	Invalid_Delim, // comma/comment configuration is invalid
}

// reader_error_kind_string maps each Reader_Error_Kind to a human-readable message.
reader_error_kind_string := [Reader_Error_Kind]string{
	.Bare_Quote    = "bare \" in non-quoted field",
	.Quote         = "extra or missing \" in quoted field",
	.Field_Count   = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
};

// Reader_Error describes a CSV-level parse failure and where it occurred.
Reader_Error :: struct {
	kind:       Reader_Error_Kind,
	start_line: int, // line on which the offending record started
	line:       int, // line on which the error was detected
	column:     int, // rune (not byte) offset of the error within the line
	expected, got: int, // used by .Field_Count
}

// Error is either a CSV-level Reader_Error or an error from the underlying io stream.
Error :: union {
	Reader_Error,
	io.Error,
}

// Initial capacity reserved for Reader.record_buffer by reader_init.
DEFAULT_RECORD_BUFFER_CAPACITY :: 256;
  71. // reader_init initializes a new Reader from r
  72. reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
  73. reader.comma = ',';
  74. context.allocator = buffer_allocator;
  75. reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY);
  76. reserve(&reader.raw_buffer, 0);
  77. reserve(&reader.field_indices, 0);
  78. reserve(&reader.last_record, 0);
  79. bufio.reader_init(&reader.r, r);
  80. }
  81. // reader_init_with_string initializes a new Reader from s
  82. reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
  83. strings.reader_init(&reader.sr, s);
  84. r, _ := io.to_reader(strings.reader_to_stream(&reader.sr));
  85. reader_init(reader, r, buffer_allocator);
  86. }
  87. // reader_destroy destroys a Reader
  88. reader_destroy :: proc(r: ^Reader) {
  89. delete(r.raw_buffer);
  90. delete(r.record_buffer);
  91. delete(r.field_indices);
  92. delete(r.last_record);
  93. bufio.reader_destroy(&r.r);
  94. }
// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including multi-line field
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		// Parse directly into r.last_record; the returned slice aliases it.
		record, err = _read_record(r, &r.last_record, allocator);
		// Keep last_record's length in sync with the record just returned so
		// the next call reuses exactly this storage. Since record aliases
		// last_record, the copy is effectively a no-op but is kept for safety.
		resize(&r.last_record, len(record));
		copy(r.last_record[:], record);
	} else {
		// Default path: each call returns a freshly allocated slice.
		record, err = _read_record(r, nil, allocator);
	}
	return;
}
  108. // is_io_error checks where an Error is a specific io.Error kind
  109. is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
  110. if v, ok := err.(io.Error); ok {
  111. return v == io_err;
  112. }
  113. return false;
  114. }
  115. // read_all reads all the remaining records from r.
  116. // Each record is a slice of fields.
  117. // read_all is defined to read until an EOF, and does not treat, and does not treat EOF as an error
  118. read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
  119. context.allocator = allocator;
  120. records: [dynamic][]string;
  121. for {
  122. record, rerr := _read_record(r, nil, allocator);
  123. if is_io_error(rerr, .EOF) {
  124. return records[:], nil;
  125. }
  126. if rerr != nil {
  127. return nil, rerr;
  128. }
  129. append(&records, record);
  130. }
  131. }
// read_from_string reads a single record (a slice of fields) from the provided input string.
//
// Returns the record, the number of bytes consumed from input, and any error.
read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
	ir: strings.Reader;
	strings.reader_init(&ir, input);
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));
	r: Reader;
	reader_init(&r, input_reader, buffer_allocator);
	defer reader_destroy(&r);
	record, err = read(&r, record_allocator);
	// NOTE(review): r.r.r is the bufio.Reader's read position; presumably it
	// equals the bytes consumed from input here because the whole string fits
	// in one buffer fill — verify for inputs larger than the bufio buffer.
	n = int(r.r.r);
	return;
}
  144. // read_all reads all the remaining records from the provided input.
  145. read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
  146. ir: strings.Reader;
  147. strings.reader_init(&ir, input);
  148. input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));
  149. r: Reader;
  150. reader_init(&r, input_reader, buffer_allocator);
  151. defer reader_destroy(&r);
  152. return read_all(&r, records_allocator);
  153. }
  154. @private
  155. is_valid_delim :: proc(r: rune) -> bool {
  156. switch r {
  157. case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
  158. return false;
  159. }
  160. return utf8.valid_rune(r);
  161. }
@private
// _read_record parses one CSV record from r into a slice of field strings.
// If dst is non-nil it is reused as the result storage; otherwise a new
// dynamic array is allocated from 'allocator'. Field bytes accumulate in
// r.record_buffer, with r.field_indices recording each field's end offset.
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	// read_line reads one '\n'-terminated line (possibly longer than the
	// bufio buffer, spilling into r.raw_buffer), strips a trailing '\r'
	// at EOF, bumps line_count, and normalizes a trailing "\r\n" to "\n".
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		line, err := bufio.reader_read_slice(&r.r, '\n');
		if err == .Buffer_Full {
			// Line exceeds the bufio buffer: accumulate pieces in raw_buffer.
			clear(&r.raw_buffer);
			append(&r.raw_buffer, ..line);
			for err == .Buffer_Full {
				line, err = bufio.reader_read_slice(&r.r, '\n');
				append(&r.raw_buffer, ..line);
			}
			line = r.raw_buffer[:];
		}
		if len(line) > 0 && err == .EOF {
			// A non-empty final line without '\n' is not an error;
			// drop a dangling '\r' as well.
			err = nil;
			if line[len(line)-1] == '\r' {
				line = line[:len(line)-1];
			}
		}
		r.line_count += 1;
		// normalize \r\n to \n
		// NOTE(review): n is never updated inside the loop, so after the
		// first shrink the slice is too short to match "\r\n" and the loop
		// exits — effectively a single 'if'.
		n := len(line);
		for n >= 2 && string(line[n-2:]) == "\r\n" {
			line[n-2] = '\n';
			line = line[:n-1];
		}
		return line, err;
	}
	// length_newline returns 1 if b ends in '\n', else 0.
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1;
		}
		return 0;
	}
	// next_rune decodes the first rune of b (RUNE_ERROR on empty/invalid).
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b);
		return r;
	}
	// Validate the delimiter configuration before reading anything.
	if r.comma == r.comment ||
	   !is_valid_delim(r.comma) ||
	   (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Reader_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		};
		return nil, err;
	}
	// Skip comment lines and blank lines until a content line is found.
	// full_line is kept for computing rune columns in error reports.
	line, full_line: []byte;
	err_read: io.Error;
	for err_read == nil {
		line, err_read = read_line(r);
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil;
			continue;
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil;
			continue;
		}
		full_line = line;
		break;
	}
	if is_io_error(err_read, .EOF) {
		return nil, err_read;
	}
	err: Error;
	quote_len :: len(`"`);
	comma_len := utf8.rune_size(r.comma);
	record_line := r.line_count; // first line of this record, for error reports
	clear(&r.record_buffer);
	clear(&r.field_indices);
	// Main field loop: each iteration consumes one field from 'line'.
	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line);
		}
		if len(line) == 0 || line[0] != '"' {
			// Unquoted field: runs to the next comma or end of line.
			i := bytes.index_rune(line, r.comma);
			field := line;
			if i >= 0 {
				field = field[:i];
			} else {
				field = field[:len(field) - length_newline(field)];
			}
			if !r.lazy_quotes {
				// A '"' inside an unquoted field is an error unless lazy.
				if j := bytes.index_byte(field, '"'); j >= 0 {
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])]);
					err = Reader_Error{
						kind = .Bare_Quote,
						start_line = record_line,
						line = r.line_count,
						column = column,
					};
					break parse_field;
				}
			}
			append(&r.record_buffer, ..field);
			append(&r.field_indices, len(r.record_buffer));
			if i >= 0 {
				line = line[i+comma_len:];
				continue parse_field;
			}
			break parse_field;
		} else {
			// Quoted field: consume up to the closing quote, handling
			// doubled quotes ("" -> ") and fields spanning multiple lines.
			line = line[quote_len:];
			for {
				i := bytes.index_byte(line, '"');
				switch {
				case i >= 0:
					// Hit a quote: copy preceding bytes, then classify what
					// follows the quote.
					append(&r.record_buffer, ..line[:i]);
					line = line[i+quote_len:];
					switch ch := next_rune(line); {
					case ch == '"': // append quote
						append(&r.record_buffer, '"');
						line = line[quote_len:];
					case ch == r.comma: // end of field
						line = line[comma_len:];
						append(&r.field_indices, len(r.record_buffer));
						continue parse_field;
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer));
						break parse_field;
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"');
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len]);
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						};
						break parse_field;
					}
				case len(line) > 0:
					// No closing quote on this line: the quoted field spans
					// onto the next line. Copy the remainder and read on.
					append(&r.record_buffer, ..line);
					if err_read != nil {
						break parse_field;
					}
					line, err_read = read_line(r);
					if is_io_error(err_read, .EOF) {
						err_read = nil;
					}
					full_line = line;
				case:
					// Input ended inside a quoted field.
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line);
						err = Reader_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						};
						break parse_field;
					}
					append(&r.field_indices, len(r.record_buffer));
					break parse_field;
				}
			}
		}
	}
	// A CSV-level error takes precedence over the underlying read error.
	if err == nil && err_read != nil {
		err = err_read;
	}
	context.allocator = allocator;
	dst := dst;
	str := string(r.record_buffer[:]);
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){};
	}
	clear(dst);
	resize(dst, len(r.field_indices));
	// Slice record_buffer into fields using the recorded end offsets,
	// cloning each string unless the caller opted into buffer reuse.
	pre_idx: int;
	for idx, i in r.field_indices {
		field := str[pre_idx:idx];
		if !r.reuse_record_buffer {
			field = strings.clone(field);
		}
		dst[i] = field;
		pre_idx = idx;
	}
	// Enforce (or learn) the expected field count.
	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Reader_Error{
				kind = .Field_Count,
				start_line = record_line,
				line = r.line_count,
				expected = r.fields_per_record,
				got = len(dst),
			};
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst);
	}
	return dst[:], err;
}
  357. }