Browse Source

Updated to latest zsv. Resolves #7.

Brucey 2 years ago
parent
commit
60508d18ff

+ 3 - 1
csv.mod/csv.bmx

@@ -25,12 +25,14 @@ bbdoc: A CSV parser.
 End Rem
 Module Text.CSV
 
-ModuleInfo "Version: 1.00"
+ModuleInfo "Version: 1.01"
 ModuleInfo "Author: Bruce A Henderson"
 ModuleInfo "License: MIT"
 ModuleInfo "zsv - Copyright (c) 2021 Guarnerix Inc dba Liquidaty"
 ModuleInfo "Copyright: 2022 Bruce A Henderson"
 
+ModuleInfo "History: 1.01"
+ModuleInfo "History: Updated to latest zsv."
 ModuleInfo "History: 1.00"
 ModuleInfo "History: Initial Release"
 

+ 1 - 0
csv.mod/zsv/include/zsv/common.h

@@ -57,6 +57,7 @@ struct zsv_cell {
 #  define ZSV_PARSER_QUOTE_CLOSED   2 /* value was quoted */
 #  define ZSV_PARSER_QUOTE_NEEDED   4 /* value contains delimiter or dbl-quote */
 #  define ZSV_PARSER_QUOTE_EMBEDDED 8 /* value contains dbl-quote */
+#  define ZSV_PARSER_QUOTE_PENDING 16 /* only used internally by parser */
   /**
    * quoted flags enable additional efficiency, in particular when input data will
    * be output as text (csv, json etc), by indicating whether the cell contents may

+ 4 - 5
csv.mod/zsv/src/zsv.c

@@ -39,12 +39,10 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) {
   scanner->last = '\0';
   if(VERY_LIKELY(scanner->old_bytes_read)) {
     scanner->last = scanner->buff.buff[scanner->old_bytes_read-1];
+
     if(scanner->row_start < scanner->old_bytes_read) {
       size_t len = scanner->old_bytes_read - scanner->row_start;
-      if(len < scanner->row_start)
-        memcpy(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len);
-      else
-        memmove(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len);
+      memmove(scanner->buff.buff, scanner->buff.buff + scanner->row_start, len);
       scanner->partial_row_length = len;
     } else {
       scanner->cell_start = 0;
@@ -54,6 +52,7 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) {
     scanner->cell_start -= scanner->row_start;
     for(size_t i2 = 0; i2 < scanner->row.used; i2++)
       scanner->row.cells[i2].str -= scanner->row_start;
+
     scanner->row_start = 0;
     scanner->old_bytes_read = 0;
   }
@@ -61,6 +60,7 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) {
   scanner->cum_scanned_length += scanner->scanned_length;
 
   size_t capacity = scanner->buff.size - scanner->partial_row_length;
+
   if(VERY_UNLIKELY(capacity == 0)) { // our row size was too small to fit a single row of data
     fprintf(stderr, "Warning: row truncated\n");
     if(scanner->mode == ZSV_MODE_FIXED) {
@@ -79,7 +79,6 @@ inline static size_t scanner_pre_parse(struct zsv_scanner *scanner) {
   return capacity;
 }
 
-
 /**
  * Read the next chunk of data from our input stream and parse it, calling our
  * custom handlers as each cell and row are parsed

+ 20 - 20
csv.mod/zsv/src/zsv_internal.c

@@ -247,7 +247,10 @@ __attribute__((always_inline)) static inline void zsv_clear_cell(struct zsv_scan
 }
 
 // always_inline has a noticeable impact. do not remove without benchmarking!
-__attribute__((always_inline)) static inline void cell_dl(struct zsv_scanner * scanner, unsigned char * s, size_t n) {
+#ifdef NDEBUG
+__attribute__((always_inline)) static inline
+#endif
+void cell_dl(struct zsv_scanner * scanner, unsigned char * s, size_t n) {
   // handle quoting
   if(UNLIKELY(scanner->quoted > 0)) {
     if(LIKELY(scanner->quote_close_position + 1 == n)) {
@@ -526,26 +529,23 @@ static void collate_header_row(void *ctx) {
   if(!scanner->opts.header_span) {
     // finished with header; combine all rows into a single row
     set_callbacks(scanner);
-//    if(VERY_LIKELY(scanner->opts.row_handler != NULL || scanner->opts.cell_handler != NULL
-//                   || scanner->mode == ZSV_MODE_DELIM_PULL)) {
-      if(scanner->collate_header) {
-        size_t offset = 0;
-        for(size_t i = 0; i < scanner->collate_header->column_count; i++) {
-          size_t len_plus1 = scanner->collate_header->lengths[i];
-          scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset;
-          if(len_plus1) {
-            scanner->row.cells[i].len = len_plus1 - 1;
-            scanner->row.cells[i].quoted = 1;
-          } else
-            scanner->row.cells[i].len = 0;
-          offset += len_plus1;
-        }
+    if(scanner->collate_header) {
+      size_t offset = 0;
+      for(size_t i = 0; i < scanner->collate_header->column_count; i++) {
+        size_t len_plus1 = scanner->collate_header->lengths[i];
+        scanner->row.cells[i].str = scanner->collate_header->buff.buff + offset;
+        if(len_plus1) {
+          scanner->row.cells[i].len = len_plus1 - 1;
+          scanner->row.cells[i].quoted = 1;
+        } else
+          scanner->row.cells[i].len = 0;
+        offset += len_plus1;
       }
+    }
 
-      apply_callbacks(scanner);
-      if(scanner->mode != ZSV_MODE_DELIM_PULL)
-        collate_header_destroy(&scanner->collate_header);
-//    }
+    apply_callbacks(scanner);
+    if(scanner->mode != ZSV_MODE_DELIM_PULL)
+      collate_header_destroy(&scanner->collate_header);
   }
 }
 
@@ -580,7 +580,7 @@ static void zsv_throwaway_row(void *ctx) {
 
 static int zsv_scanner_init(struct zsv_scanner *scanner,
                               struct zsv_opts *opts) {
-  size_t need_buff_size = 0; // opts->buffsize
+  size_t need_buff_size = 0;
   if(opts->buffsize < opts->max_row_size * 2)
     need_buff_size = opts->max_row_size * 2;
   opts->delimiter = opts->delimiter ? opts->delimiter : ',';

+ 4 - 5
csv.mod/zsv/src/zsv_scan_delim.c

@@ -72,7 +72,6 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner,
   skip_next_delim = 0;
   bytes_chunk_end = bytes_read >= sizeof(zsv_uc_vector) ? bytes_read - sizeof(zsv_uc_vector) + 1 : 0;
   delimiter = scanner->opts.delimiter;
-
   scanner->partial_row_length = 0;
 
   // to do: move into one-time execution code?
@@ -85,9 +84,8 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner,
   // case "hel"|"o": check if we have an embedded dbl-quote past the initial opening quote, which was
   // split between the last buffer and this one e.g. "hel""o" where the last buffer ended
   // with "hel" and this one starts with "o"
-  if((scanner->quoted & ZSV_PARSER_QUOTE_UNCLOSED)
-     && i > scanner->cell_start + 1 // case "|hello": need the + 1 in case split after first char of quoted value e.g. "hello" => " and hello"
-     && scanner->last == quote) {
+  if(scanner->quoted & ZSV_PARSER_QUOTE_PENDING) {
+    scanner->quoted -= ZSV_PARSER_QUOTE_PENDING;
     if(buff[i] != quote) {
       scanner->quoted |= ZSV_PARSER_QUOTE_CLOSED;
       scanner->quoted -= ZSV_PARSER_QUOTE_UNCLOSED;
@@ -229,7 +227,8 @@ static enum zsv_status ZSV_SCAN_DELIM(struct zsv_scanner *scanner,
             scanner->quoted |= ZSV_PARSER_QUOTE_EMBEDDED;
             skip_next_delim = 1;
           }
-        }
+        } else // we are at the end of the input!
+          scanner->quoted |= ZSV_PARSER_QUOTE_PENDING;
       } else {
         // cell_length > 0 and cell did not start w quote, so
         // we have a quote in middle of an unquoted cell