[SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c

gawk-diffs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c

From:	Arnold Robbins
Subject:	[SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c
Date:	Mon, 29 Aug 2022 08:00:00 -0400 (EDT)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".

The branch, feature/bwk-csv has been created
        at  1ee8627c7bb42dad235c66e62050bf61f59cbb6e (commit)

- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=1ee8627c7bb42dad235c66e62050bf61f59cbb6e

commit 1ee8627c7bb42dad235c66e62050bf61f59cbb6e
Author: Arnold D. Robbins <arnold@skeeve.com>
Date:   Mon Aug 29 14:59:29 2022 +0300

    First cut at CSV support a la BWK awk.

diff --git a/ChangeLog b/ChangeLog
index 46286435..9090eaca 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2022-08-29         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * field.c (comma_parse_field, set_comma_field): New functions.
+       (set_FS): If FS is "," and not posix mode, use CSV parsing.
+       Code follows what BWK's new code does.
+
 2022-08-25         Arnold D. Robbins     <arnold@skeeve.com>
 
        * awkgram.y (yyerror): Exit at the end, to make syntax errors
diff --git a/field.c b/field.c
index 0232cd5a..755a3fd4 100644
--- a/field.c
+++ b/field.c
@@ -59,12 +59,15 @@ static long sc_parse_field(long, char **, int, NODE *,
                             Regexp *, Setfunc, NODE *, NODE *, bool);
 static long fw_parse_field(long, char **, int, NODE *,
                             Regexp *, Setfunc, NODE *, NODE *, bool);
+static long comma_parse_field(long, char **, int, NODE *,
+                            Regexp *, Setfunc, NODE *, NODE *, bool);
 static const awk_fieldwidth_info_t *api_fw = NULL;
 static long fpat_parse_field(long, char **, int, NODE *,
                             Regexp *, Setfunc, NODE *, NODE *, bool);
 static void set_element(long num, char * str, long len, NODE *arr);
 static void grow_fields_arr(long num);
 static void set_field(long num, char *str, long len, NODE *dummy);
+static void set_comma_field(long num, char *str, long len, NODE *dummy);
 static void purge_record(void);
 
 static char *parse_extent;     /* marks where to restart parse of record */
@@ -147,6 +150,26 @@ set_field(long num,
        n->flags = (STRCUR|STRING|USER_INPUT);  /* do not set MALLOC */
 }
 
+/* set_comma_field --- set the value of a particular field, coming from CSV */
+
+/*ARGSUSED*/
+static void
+set_comma_field(long num,
+       char *str,
+       long len,
+       NODE *dummy ATTRIBUTE_UNUSED)   /* just to make interface same as set_element */
+{
+       NODE *n;
+       NODE *val = make_string(str, len);
+
+       if (num > nf_high_water)
+               grow_fields_arr(num);
+       n = fields_arr[num];
+       n->stptr = val->stptr;
+       n->stlen = val->stlen;
+       n->flags = (STRCUR|STRING|USER_INPUT|MALLOC);
+}
+
 /* rebuild_record --- Someone assigned a value to $(something).
                        Fix up $0 to be right */
 
@@ -740,6 +763,98 @@ sc_parse_field(long up_to, /* parse only up to this field number */
        return nf;
 }
 
+/*
+ * comma_parse_field --- CSV parsing same as BWK awk.
+ *
+ * This is called both from get_field() and from do_split()
+ * via (*parse_field)().  This variation is for when FS is a comma,
+ * we do very basic CSV parsing, the same as BWK awk.
+ */
+static long
+comma_parse_field(long up_to,  /* parse only up to this field number */
+       char **buf,     /* on input: string to parse; on output: point to start next */
+       int len,
+       NODE *fs,
+       Regexp *rp ATTRIBUTE_UNUSED,
+       Setfunc set,    /* routine to set the value of the parsed field */
+       NODE *n,
+       NODE *sep_arr,  /* array of field separators (maybe NULL) */
+       bool in_middle ATTRIBUTE_UNUSED)
+{
+       char *scan = *buf;
+       static const char comma = ',';
+       long nf = parse_high_water;
+       char *field;
+       char *end = scan + len;
+
+       static char *newfield = NULL;
+       static size_t buflen = 0;
+
+       if (newfield == NULL) {
+               emalloc(newfield, char *, BUFSIZ, "comma_parse_field");
+               buflen = BUFSIZ;
+       }
+
+       if (set == set_field)   // not an array element
+               set = set_comma_field;
+
+       if (up_to == UNLIMITED)
+               nf = 0;
+
+       if (len == 0) {
+               (*set)(++nf, newfield, 0L, n);
+               return nf;
+       }
+
+       for (; nf < up_to;) {
+               char *new_end = newfield;
+               memset(newfield, '\0', buflen);
+
+               while (*scan != comma && scan < end) {
+                       if (*scan == '"') {
+                               for (scan++; scan < end;) {
+                                       if (*scan == '"' && scan[1] == '"') {   // "" -> "
+                                               *new_end++ = '"';
+                                               scan += 2;
+                                       } else if (*scan == '"' && (scan == end-1 || scan[1] == comma)) {
+                                               // close of quoted string
+                                               scan++;
+                                               break;
+                                       } else {
+                                               // grow buffer if needed
+                                               *new_end++ = *scan++;
+                                       }
+                               }
+                       } else {
+                               // unquoted field
+                               while (*scan != comma && scan < end) {
+                                       // grow buffer if needed
+                                       *new_end++ = *scan++;
+                               }
+                       }
+               }
+
+               (*set)(++nf, newfield, (long)(new_end - newfield), n);
+
+               if (scan == end)
+                       break;
+
+               if (scan == *buf) {
+                       scan++;
+                       continue;
+               }
+
+               scan++;
+               if (scan == end) {      /* FS at end of record */
+                       (*set)(++nf, newfield, 0L, n);
+                       break;
+               }
+       }
+
+       *buf = scan;
+       return nf;
+}
+
 /*
  * calc_mbslen --- calculate the length in bytes of a multi-byte string
  * containing len characters.
@@ -1309,7 +1424,8 @@ set_FS()
        save_rs = dupnode(RS_node->var_value);
        resave_fs = true;
 
-       /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
+       /*
+        * If FS_re_no_case assignment is fatal (make_regexp in remake_re)
         * FS_regexp will be NULL with a non-null FS_re_yes_case.
         * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
         * Please do not remerge.
@@ -1363,6 +1479,8 @@ choose_fs_function:
                        else if (fs->stptr[0] == '\\')
                                /* same special case */
                                strcpy(buf, "[\\\\]");
+                       else if (fs->stptr[0] == ',' && ! do_posix)
+                               set_parser(comma_parse_field);
                        else
                                set_parser(sc_parse_field);
                }
diff --git a/pc/ChangeLog b/pc/ChangeLog
index 9b09d8e9..f5d09d28 100644
--- a/pc/ChangeLog
+++ b/pc/ChangeLog
@@ -1,3 +1,7 @@
+2022-08-29         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * Makefile.tst: Regenerated.
+
 2022-08-25         Arnold D. Robbins     <arnold@skeeve.com>
 
        * Makefile.tst: Regenerated.
diff --git a/pc/Makefile.tst b/pc/Makefile.tst
index e8e75e0f..1ffd3f0d 100644
--- a/pc/Makefile.tst
+++ b/pc/Makefile.tst
@@ -188,7 +188,7 @@ GAWK_EXT_TESTS = \
        aadelete1 aadelete2 aarray1 aasort aasorti argtest arraysort \
        arraysort2 arraytype asortbool backw badargs beginfile1 beginfile2 \
        binmode1 charasbytes clos1way clos1way2 clos1way3 clos1way4 \
-       clos1way5 clos1way6 colonwarn commas crlf dbugeval dbugeval2 \
+       clos1way5 clos1way6 colonwarn commas crlf csv1 dbugeval dbugeval2 \
        dbugeval3 dbugtypedre1 dbugtypedre2 delsub devfd devfd1 devfd2 \
        dfacheck1 dumpvars errno exit fieldwdth forcenum fpat1 fpat2 \
        fpat3 fpat4 fpat5 fpat6 fpat7 fpat8 fpat9 fpatnull fsfwfs functab1 \
@@ -2704,6 +2704,11 @@ crlf:
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+csv1:
+       @echo $@
+       @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 dbugeval2:
        @echo $@
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  --debug < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/ChangeLog b/test/ChangeLog
index 3a2691ed..c8f37a30 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,8 @@
+2022-08-29         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * Makefile.am (EXTRA_DIST): csv1: New test.
+       * csv1.awk, csv1.in, csv1.ok: New files.
+
 2022-08-25         Arnold D. Robbins     <arnold@skeeve.com>
 
        * Makefile.am (EXTRA_DIST): nsbad2, nsbad3: new tests.
diff --git a/test/Makefile.am b/test/Makefile.am
index 962885b5..36068728 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -223,6 +223,9 @@ EXTRA_DIST = \
        convfmt.ok \
        crlf.awk \
        crlf.ok \
+       csv1.awk \
+       csv1.in \
+       csv1.ok \
        datanonl.awk \
        datanonl.in \
        datanonl.ok \
@@ -1478,7 +1481,7 @@ GAWK_EXT_TESTS = \
        aadelete1 aadelete2 aarray1 aasort aasorti argtest arraysort \
        arraysort2 arraytype asortbool backw badargs beginfile1 beginfile2 \
        binmode1 charasbytes clos1way clos1way2 clos1way3 clos1way4 \
-       clos1way5 clos1way6 colonwarn commas crlf dbugeval dbugeval2 \
+       clos1way5 clos1way6 colonwarn commas crlf csv1 dbugeval dbugeval2 \
        dbugeval3 dbugtypedre1 dbugtypedre2 delsub devfd devfd1 devfd2 \
        dfacheck1 dumpvars errno exit fieldwdth forcenum fpat1 fpat2 \
        fpat3 fpat4 fpat5 fpat6 fpat7 fpat8 fpat9 fpatnull fsfwfs functab1 \
diff --git a/test/Makefile.in b/test/Makefile.in
index f7ac3cfa..a2057e72 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -491,6 +491,9 @@ EXTRA_DIST = \
        convfmt.ok \
        crlf.awk \
        crlf.ok \
+       csv1.awk \
+       csv1.in \
+       csv1.ok \
        datanonl.awk \
        datanonl.in \
        datanonl.ok \
@@ -1746,7 +1749,7 @@ GAWK_EXT_TESTS = \
        aadelete1 aadelete2 aarray1 aasort aasorti argtest arraysort \
        arraysort2 arraytype asortbool backw badargs beginfile1 beginfile2 \
        binmode1 charasbytes clos1way clos1way2 clos1way3 clos1way4 \
-       clos1way5 clos1way6 colonwarn commas crlf dbugeval dbugeval2 \
+       clos1way5 clos1way6 colonwarn commas crlf csv1 dbugeval dbugeval2 \
        dbugeval3 dbugtypedre1 dbugtypedre2 delsub devfd devfd1 devfd2 \
        dfacheck1 dumpvars errno exit fieldwdth forcenum fpat1 fpat2 \
        fpat3 fpat4 fpat5 fpat6 fpat7 fpat8 fpat9 fpatnull fsfwfs functab1 \
@@ -4445,6 +4448,11 @@ crlf:
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+csv1:
+       @echo $@
+       @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 dbugeval2:
        @echo $@
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  --debug < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/Maketests b/test/Maketests
index 8b88ed83..a4d685a7 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -1402,6 +1402,11 @@ crlf:
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+csv1:
+       @echo $@
+       @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 dbugeval2:
        @echo $@
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  --debug < "$(srcdir)"/$@.in >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/csv1.awk b/test/csv1.awk
new file mode 100644
index 00000000..12bbf1e5
--- /dev/null
+++ b/test/csv1.awk
@@ -0,0 +1,10 @@
+BEGIN {
+       FS = ","
+}
+
+{
+       printf(" \t%s\t", $0)
+       for (i = 1; i <= NF; i++)
+               printf("[%s]", $i)
+       print ""
+}
diff --git a/test/csv1.in b/test/csv1.in
new file mode 100644
index 00000000..620b2ab3
--- /dev/null
+++ b/test/csv1.in
@@ -0,0 +1,26 @@
+a
+  a
+,a
+ , a
+a,b
+a,b,c
+""
+"abc"
+"a""b"
+"a","b"
+a""b
+"a,b"
+""""
+""""""
+"""x"""
+,,""
+a""b
+a"b
+a''b
+"abc
+,,
+a,
+"",
+,
+"abc",def
+
diff --git a/test/csv1.ok b/test/csv1.ok
new file mode 100644
index 00000000..178daed1
--- /dev/null
+++ b/test/csv1.ok
@@ -0,0 +1,26 @@
+       a       [a]
+         a     [  a]
+       ,a      [][a]
+        , a    [ ][ a]
+       a,b     [a][b]
+       a,b,c   [a][b][c]
+       ""      []
+       "abc"   [abc]
+       "a""b"  [a"b]
+       "a","b" [a][b]
+       a""b    [a""b]
+       "a,b"   [a,b]
+       """"    ["]
+       """"""  [""]
+       """x""" ["x"]
+       ,,""    [][][]
+       a""b    [a""b]
+       a"b     [a"b]
+       a''b    [a''b]
+       "abc    [abc]
+       ,,      [][][]
+       a,      [a][]
+       "",     [][]
+       ,       [][]
+       "abc",def       [abc][def]
+               []

-----------------------------------------------------------------------


hooks/post-receive
-- 
gawk
[Prev in Thread]
Current Thread
[Next in Thread]
[SCM] gawk branch, feature/bwk-csv, created. gawk-4.1.0-4896-g1ee8627c, Arnold Robbins <=
Prev by Date: [SCM] gawk branch, feature/improve-inet, updated. gawk-4.1.0-5535-gaaf639e2
Next by Date: [SCM] gawk branch, feature/bwk-csv, updated. gawk-4.1.0-4897-g2571e2d5
Previous by thread: [SCM] gawk branch, feature/improve-inet, updated. gawk-4.1.0-5535-gaaf639e2
Next by thread: [SCM] gawk branch, feature/bwk-csv, updated. gawk-4.1.0-4897-g2571e2d5
Index(es):
- Date
- Thread