bug-gawk
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Records longer than INT_MAX mishandled


From: Miguel Pineiro Jr.
Subject: Re: Records longer than INT_MAX mishandled
Date: Tue, 04 May 2021 12:08:39 -0400
User-agent: Cyrus-JMAP/3.5.0-alpha0-442-g5daca166b9-fm-20210428.001-g5daca166

Hi, Arnold.

Here's a crack at size_t records.

Starting at the bottleneck, get_a_record gains a new parameter of type pointer 
to size_t. While the function's return type remains an int, the meaning of 
non-negative values is different.
    
Return Value           Unpatched             Patched
------------        --------------        --------------
          -2        Retryable Read        Retryable Read
         EOF        End of File           End of File
           0        Record Length         Valid Record
  1..INT_MAX        Record Length         <unused>

0 thru INT_MAX had been record lengths. This patch changes 0 to mean there's a 
valid record whose length is stored in the address pointed to by the new 
parameter. Positive values are no longer returned.

inrec, do_getline, do_getline_redir, and set_record now use only size_t record 
lengths.

The extension api wasn't altered. If it should match the capabilities of the 
internal reader, it'll need some tending to.

I did not visit any code that wasn't directly affected by get_a_record's 
modification, so it's possible that an audit may find bits in need of attention.

This diff compiled cleanly; the test suite completed without error; and the 
following tests produced the correct results:

# Test inrec in the main i/o loop. Correct result is 1.
gawk 'BEGIN {printf("%2147483648s\n", "a")}' | gawk 'END {print NR}'

# Test set_record from bare getline. Correct result is 2.
gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline} END {print NR}'

# Test getline var. Correct result is 2.
gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline var} END {print 
NR}'



diff --git a/awk.h b/awk.h
index c9eec663..daee6e63 100644
--- a/awk.h
+++ b/awk.h
@@ -1553,7 +1553,7 @@ extern NODE *get_actual_argument(NODE *, int, bool);
 #endif
 /* field.c */
 extern void init_fields(void);
-extern void set_record(const char *buf, int cnt, const awk_fieldwidth_info_t 
*);
+extern void set_record(const char *buf, size_t cnt, const 
awk_fieldwidth_info_t *);
 extern void reset_record(void);
 extern void rebuild_record(void);
 extern void set_NF(void);
diff --git a/field.c b/field.c
index c21046b9..be620cb5 100644
--- a/field.c
+++ b/field.c
@@ -261,13 +261,13 @@ rebuild_record()
  * but better correct than fast.
  */
 void
-set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
+set_record(const char *buf, size_t cnt, const awk_fieldwidth_info_t *fw)
 {
        NODE *n;
        static char *databuf;
-       static unsigned long databuf_size;
+       static size_t databuf_size;
 #define INITIAL_SIZE   512
-#define MAX_SIZE       ((unsigned long) ~0)    /* maximally portable ... */
+#define MAX_SIZE       ((size_t) ~0)   /* maximally portable ... */
 
        purge_record();
 
diff --git a/io.c b/io.c
index 91c94d9b..00e536dd 100644
--- a/io.c
+++ b/io.c
@@ -287,7 +287,7 @@ static RECVALUE rsrescan(IOBUF *iop, struct recmatch *recm, 
SCANSTATE *state);
 
 static RECVALUE (*matchrec)(IOBUF *iop, struct recmatch *recm, SCANSTATE 
*state) = rs1scan;
 
-static int get_a_record(char **out, IOBUF *iop, int *errcode, const 
awk_fieldwidth_info_t **field_width);
+static int get_a_record(char **out, size_t *len, IOBUF *iop, int *errcode, 
const awk_fieldwidth_info_t **field_width);
 
 static void free_rp(struct redirect *rp);
 
@@ -585,21 +585,19 @@ bool
 inrec(IOBUF *iop, int *errcode)
 {
        char *begin;
-       int cnt;
-       bool retval = true;
+       size_t cnt;
+       bool retval;
        const awk_fieldwidth_info_t *field_width = NULL;
 
        if (at_eof(iop) && no_data_left(iop))
-               cnt = EOF;
+               retval = false;
        else if ((iop->flag & IOP_CLOSED) != 0)
-               cnt = EOF;
+               retval = false;
        else
-               cnt = get_a_record(& begin, iop, errcode, & field_width);
+               /* Note that get_a_record may return -2 when I/O would block */
+               retval = (get_a_record(& begin, & cnt, iop, errcode, & 
field_width) == 0);
 
-       /* Note that get_a_record may return -2 when I/O would block */
-       if (cnt < 0) {
-               retval = false;
-       } else {
+       if (retval) {
                INCREMENT_REC(NR);
                INCREMENT_REC(FNR);
                set_record(begin, cnt, field_width);
@@ -2843,7 +2841,8 @@ do_getline_redir(int into_variable, enum redirval 
redirtype)
 {
        struct redirect *rp = NULL;
        IOBUF *iop;
-       int cnt = EOF;
+       size_t cnt;
+       int retval = EOF;
        char *s = NULL;
        int errcode;
        NODE *redir_exp = NULL;
@@ -2878,14 +2877,14 @@ do_getline_redir(int into_variable, enum redirval 
redirtype)
                return make_number((AWKNUM) 0.0);
 
        errcode = 0;
-       cnt = get_a_record(& s, iop, & errcode, (lhs ? NULL : & field_width));
+       retval = get_a_record(& s, & cnt, iop, & errcode, (lhs ? NULL : & 
field_width));
        if (errcode != 0) {
                if (! do_traditional && (errcode != -1))
                        update_ERRNO_int(errcode);
-               return make_number((AWKNUM) cnt);
+               return make_number((AWKNUM) retval);
        }
 
-       if (cnt == EOF) {
+       if (retval == EOF) {
                /*
                 * Don't do iop_close() here if we are
                 * reading from a pipe; otherwise
@@ -2915,7 +2914,8 @@ do_getline_redir(int into_variable, enum redirval 
redirtype)
 NODE *
 do_getline(int into_variable, IOBUF *iop)
 {
-       int cnt = EOF;
+       size_t cnt;
+       int retval = EOF;
        char *s = NULL;
        int errcode;
        const awk_fieldwidth_info_t *field_width = NULL;
@@ -2927,16 +2927,16 @@ do_getline(int into_variable, IOBUF *iop)
        }
 
        errcode = 0;
-       cnt = get_a_record(& s, iop, & errcode, (into_variable ? NULL : & 
field_width));
+       retval = get_a_record(& s, & cnt, iop, & errcode, (into_variable ? NULL 
: & field_width));
        if (errcode != 0) {
                if (! do_traditional && (errcode != -1))
                        update_ERRNO_int(errcode);
                if (into_variable)
                        (void) POP_ADDRESS();
-               return make_number((AWKNUM) cnt);
+               return make_number((AWKNUM) retval);
        }
 
-       if (cnt == EOF)
+       if (retval == EOF)
                return NULL;    /* try next file */
        INCREMENT_REC(NR);
        INCREMENT_REC(FNR);
@@ -3881,13 +3881,14 @@ errno_io_retry(void)
 
 /*
  * get_a_record --- read a record from IOP into out,
- * return length of EOF, set RT.
+ * its length into len, and set RT.
+ * return 0 on success, EOF when out of data, and -2 if I/O would block.
  * Note that errcode is never NULL, and the caller initializes *errcode to 0.
- * If I/O would block, return -2.
  */
 
 static int
 get_a_record(char **out,        /* pointer to pointer to data */
+        size_t *len,            /* pointer to record length */
         IOBUF *iop,             /* input IOP */
         int *errcode,           /* pointer to error variable */
         const awk_fieldwidth_info_t **field_width)
@@ -3896,7 +3897,6 @@ get_a_record(char **out,        /* pointer to pointer to 
data */
        struct recmatch recm;
        SCANSTATE state;
        RECVALUE ret;
-       int retval;
        NODE *rtval = NULL;
        static RECVALUE (*lastmatchrec)(IOBUF *iop, struct recmatch *recm, 
SCANSTATE *state) = NULL;
 
@@ -3915,6 +3915,9 @@ get_a_record(char **out,        /* pointer to pointer to 
data */
                if (rc == EOF)
                        iop->flag |= IOP_AT_EOF;
                else {
+                       assert(rc >= 0);
+                       *len = rc;
+                       rc = 0;
                        if (rt_len != 0)
                                set_RT(rt_start, rt_len);
                        else
@@ -4074,11 +4077,11 @@ get_a_record(char **out,        /* pointer to pointer 
to data */
 
        if (recm.len == 0) {
                *out = NULL;
-               retval = 0;
+               *len = 0;
        } else {
                assert(recm.start != NULL);
                *out = recm.start;
-               retval = recm.len;
+               *len = recm.len;
        }
 
        iop->off += recm.len + recm.rt_len;
@@ -4086,7 +4089,7 @@ get_a_record(char **out,        /* pointer to pointer to 
data */
        if (recm.len == 0 && recm.rt_len == 0 && at_eof(iop))
                return EOF;
        else
-               return retval;
+               return 0;
 }
 
 /* set_RS --- update things as appropriate when RS is set */



On Tue, May 4, 2021, at 4:17 AM, arnold@skeeve.com wrote:
> Hi.
> 
> Thanks for the report and patch.
> 
> If you can produce a patch to allow records to be SIZE_T_MAX (or whatever
> the right constant is), I'd be interested.
> 
> I will try to find some time to look into this also.
> 
> Arnold
> 
> "Miguel Pineiro Jr." <mpj@pineiro.cc> wrote:
> 
> > Hello, gawk devs.
> >
> > gawk mishandles records longer than INT_MAX when get_a_record stuffs their 
> > size_t length in an int (io.c:4081: `retval = recm.len`).
> >
> > All of the following examples are paired, first a success using a record of 
> > length INT_MAX, then a failure using INT_MAX + 1.
> >
> >
> > In the main i/o loop, records vanish when their corrupted length is 
> > negative, since inrec doesn't consider a negative value a valid record.
> >
> > $ gawk 'BEGIN {printf("%2147483647s\n", "a")}' | gawk 'END {print NR}'
> > 1
> > $ gawk 'BEGIN {printf("%2147483648s\n", "a")}' | gawk 'END {print NR}'
> > 0
> >
> >
> > In getline (do_getline/do_getline_redir), if the corrupted length is equal 
> > to EOF, it will trigger a silent bypass of the rest of the file. More 
> > likely, some other value will mislead buffer memory management routines and 
> > crash gawk.
> >
> > This bare getline fails fatally in set_record's buffer resizing loop, when 
> > it gives up trying to accomodate what it thinks is a humongous record 
> > (field.c:284: `cnt >= databuf_size` promotes a negative int cnt to unsigned 
> > long).
> >
> > $ gawk 'BEGIN {printf("\n%2147483647s\n", "a")}' | gawk '{getline} END 
> > {print NR}'
> > 2
> > $ gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline} END 
> > {print NR}'
> > gawk: cmd. line:1: (FILENAME=- FNR=2) fatal: input record too large
> >
> >
> > This getline var dies in make_string (make_str_node) from a corrupted 
> > allocation request:
> >
> > $ gawk 'BEGIN {printf("\n%2147483647s\n", "a")}' | gawk '{getline var} END 
> > {print NR}'
> > 2
> > $ gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline var} END 
> > {print NR}'
> > gawk: cmd. line:1: (FILENAME=- FNR=2) fatal: node.c:415:make_str_node: 
> > r->stptr: cannot allocate -2147483647 bytes of memory: Cannot allocate 
> > memory
> >
> >
> > If INT_MAX is deemed sufficient, despite the use of capacious size_t i/o 
> > buffers, here's a diff.
> >
> > diff --git a/io.c b/io.c
> > index 91c94d9b..4e777d75 100644
> > --- a/io.c
> > +++ b/io.c
> > @@ -4026,6 +4026,9 @@ get_a_record(char **out,        /* pointer to pointer 
> > to data */
> >                     iop->dataend += iop->count;
> >     }
> >  
> > +   if (recm.len > INT_MAX)
> > +           fatal(_("input record length too large to return"));
> > +
> >     /* set record, RT, return right value */
> >  
> >     /*
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]