[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Records longer than INT_MAX mishandled
From: |
Miguel Pineiro Jr. |
Subject: |
Re: Records longer than INT_MAX mishandled |
Date: |
Tue, 04 May 2021 12:08:39 -0400 |
User-agent: |
Cyrus-JMAP/3.5.0-alpha0-442-g5daca166b9-fm-20210428.001-g5daca166 |
Hi, Arnold.
Here's a crack at size_t records.
Starting at the bottleneck, get_a_record gains a new parameter of type pointer
to size_t. While the function's return type remains an int, the meaning of
non-negative values is different.
Return Value Unpatched Patched
------------ -------------- --------------
-2 Retryable Read Retryable Read
EOF End of File End of File
0 Record Length Valid Record
1..INT_MAX Record Length <unused>
0 thru INT_MAX had been record lengths. This patch changes 0 to mean there's a
valid record whose length is stored in the address pointed to by the new
parameter. Positive values are no longer returned.
inrec, do_getline, do_getline_redir, and set_record now use only size_t record
lengths.
The extension api wasn't altered. If it should match the capabilities of the
internal reader, it'll need some tending to.
I did not visit any code that wasn't directly affected by get_a_record's
modification, so it's possible that an audit may find bits in need of attention.
This diff compiled cleanly; the test suite completed without error; and the
following tests produced the correct results:
# Test inrec in the main i/o loop. Correct result is 1.
gawk 'BEGIN {printf("%2147483648s\n", "a")}' | gawk 'END {print NR}'
# Test set_record from bare getline. Correct result is 2.
gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline} END {print NR}'
# Test getline var. Correct result is 2.
gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline var} END {print
NR}'
diff --git a/awk.h b/awk.h
index c9eec663..daee6e63 100644
--- a/awk.h
+++ b/awk.h
@@ -1553,7 +1553,7 @@ extern NODE *get_actual_argument(NODE *, int, bool);
#endif
/* field.c */
extern void init_fields(void);
-extern void set_record(const char *buf, int cnt, const awk_fieldwidth_info_t
*);
+extern void set_record(const char *buf, size_t cnt, const
awk_fieldwidth_info_t *);
extern void reset_record(void);
extern void rebuild_record(void);
extern void set_NF(void);
diff --git a/field.c b/field.c
index c21046b9..be620cb5 100644
--- a/field.c
+++ b/field.c
@@ -261,13 +261,13 @@ rebuild_record()
* but better correct than fast.
*/
void
-set_record(const char *buf, int cnt, const awk_fieldwidth_info_t *fw)
+set_record(const char *buf, size_t cnt, const awk_fieldwidth_info_t *fw)
{
NODE *n;
static char *databuf;
- static unsigned long databuf_size;
+ static size_t databuf_size;
#define INITIAL_SIZE 512
-#define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
+#define MAX_SIZE ((size_t) ~0) /* maximally portable ... */
purge_record();
diff --git a/io.c b/io.c
index 91c94d9b..00e536dd 100644
--- a/io.c
+++ b/io.c
@@ -287,7 +287,7 @@ static RECVALUE rsrescan(IOBUF *iop, struct recmatch *recm,
SCANSTATE *state);
static RECVALUE (*matchrec)(IOBUF *iop, struct recmatch *recm, SCANSTATE
*state) = rs1scan;
-static int get_a_record(char **out, IOBUF *iop, int *errcode, const
awk_fieldwidth_info_t **field_width);
+static int get_a_record(char **out, size_t *len, IOBUF *iop, int *errcode,
const awk_fieldwidth_info_t **field_width);
static void free_rp(struct redirect *rp);
@@ -585,21 +585,19 @@ bool
inrec(IOBUF *iop, int *errcode)
{
char *begin;
- int cnt;
- bool retval = true;
+ size_t cnt;
+ bool retval;
const awk_fieldwidth_info_t *field_width = NULL;
if (at_eof(iop) && no_data_left(iop))
- cnt = EOF;
+ retval = false;
else if ((iop->flag & IOP_CLOSED) != 0)
- cnt = EOF;
+ retval = false;
else
- cnt = get_a_record(& begin, iop, errcode, & field_width);
+ /* Note that get_a_record may return -2 when I/O would block */
+ retval = (get_a_record(& begin, & cnt, iop, errcode, &
field_width) == 0);
- /* Note that get_a_record may return -2 when I/O would block */
- if (cnt < 0) {
- retval = false;
- } else {
+ if (retval) {
INCREMENT_REC(NR);
INCREMENT_REC(FNR);
set_record(begin, cnt, field_width);
@@ -2843,7 +2841,8 @@ do_getline_redir(int into_variable, enum redirval
redirtype)
{
struct redirect *rp = NULL;
IOBUF *iop;
- int cnt = EOF;
+ size_t cnt;
+ int retval = EOF;
char *s = NULL;
int errcode;
NODE *redir_exp = NULL;
@@ -2878,14 +2877,14 @@ do_getline_redir(int into_variable, enum redirval
redirtype)
return make_number((AWKNUM) 0.0);
errcode = 0;
- cnt = get_a_record(& s, iop, & errcode, (lhs ? NULL : & field_width));
+ retval = get_a_record(& s, & cnt, iop, & errcode, (lhs ? NULL : &
field_width));
if (errcode != 0) {
if (! do_traditional && (errcode != -1))
update_ERRNO_int(errcode);
- return make_number((AWKNUM) cnt);
+ return make_number((AWKNUM) retval);
}
- if (cnt == EOF) {
+ if (retval == EOF) {
/*
* Don't do iop_close() here if we are
* reading from a pipe; otherwise
@@ -2915,7 +2914,8 @@ do_getline_redir(int into_variable, enum redirval
redirtype)
NODE *
do_getline(int into_variable, IOBUF *iop)
{
- int cnt = EOF;
+ size_t cnt;
+ int retval = EOF;
char *s = NULL;
int errcode;
const awk_fieldwidth_info_t *field_width = NULL;
@@ -2927,16 +2927,16 @@ do_getline(int into_variable, IOBUF *iop)
}
errcode = 0;
- cnt = get_a_record(& s, iop, & errcode, (into_variable ? NULL : &
field_width));
+ retval = get_a_record(& s, & cnt, iop, & errcode, (into_variable ? NULL
: & field_width));
if (errcode != 0) {
if (! do_traditional && (errcode != -1))
update_ERRNO_int(errcode);
if (into_variable)
(void) POP_ADDRESS();
- return make_number((AWKNUM) cnt);
+ return make_number((AWKNUM) retval);
}
- if (cnt == EOF)
+ if (retval == EOF)
return NULL; /* try next file */
INCREMENT_REC(NR);
INCREMENT_REC(FNR);
@@ -3881,13 +3881,14 @@ errno_io_retry(void)
/*
* get_a_record --- read a record from IOP into out,
- * return length of EOF, set RT.
+ * its length into len, and set RT.
+ * return 0 on success, EOF when out of data, and -2 if I/O would block.
* Note that errcode is never NULL, and the caller initializes *errcode to 0.
- * If I/O would block, return -2.
*/
static int
get_a_record(char **out, /* pointer to pointer to data */
+ size_t *len, /* pointer to record length */
IOBUF *iop, /* input IOP */
int *errcode, /* pointer to error variable */
const awk_fieldwidth_info_t **field_width)
@@ -3896,7 +3897,6 @@ get_a_record(char **out, /* pointer to pointer to
data */
struct recmatch recm;
SCANSTATE state;
RECVALUE ret;
- int retval;
NODE *rtval = NULL;
static RECVALUE (*lastmatchrec)(IOBUF *iop, struct recmatch *recm,
SCANSTATE *state) = NULL;
@@ -3915,6 +3915,9 @@ get_a_record(char **out, /* pointer to pointer to
data */
if (rc == EOF)
iop->flag |= IOP_AT_EOF;
else {
+ assert(rc >= 0);
+ *len = rc;
+ rc = 0;
if (rt_len != 0)
set_RT(rt_start, rt_len);
else
@@ -4074,11 +4077,11 @@ get_a_record(char **out, /* pointer to pointer
to data */
if (recm.len == 0) {
*out = NULL;
- retval = 0;
+ *len = 0;
} else {
assert(recm.start != NULL);
*out = recm.start;
- retval = recm.len;
+ *len = recm.len;
}
iop->off += recm.len + recm.rt_len;
@@ -4086,7 +4089,7 @@ get_a_record(char **out, /* pointer to pointer to
data */
if (recm.len == 0 && recm.rt_len == 0 && at_eof(iop))
return EOF;
else
- return retval;
+ return 0;
}
/* set_RS --- update things as appropriate when RS is set */
On Tue, May 4, 2021, at 4:17 AM, arnold@skeeve.com wrote:
> Hi.
>
> Thanks for the report and patch.
>
> If you can produce a patch to allow records to be SIZE_T_MAX (or whatever
> the right constant is), I'd be interested.
>
> I will try to find some time to look into this also.
>
> Arnold
>
> "Miguel Pineiro Jr." <mpj@pineiro.cc> wrote:
>
> > Hello, gawk devs.
> >
> > gawk mishandles records longer than INT_MAX when get_a_record stuffs their
> > size_t length in an int (io.c:4081: `retval = recm.len`).
> >
> > All of the following examples are paired, first a success using a record of
> > length INT_MAX, then a failure using INT_MAX + 1.
> >
> >
> > In the main i/o loop, records vanish when their corrupted length is
> > negative, since inrec doesn't consider a negative value a valid record.
> >
> > $ gawk 'BEGIN {printf("%2147483647s\n", "a")}' | gawk 'END {print NR}'
> > 1
> > $ gawk 'BEGIN {printf("%2147483648s\n", "a")}' | gawk 'END {print NR}'
> > 0
> >
> >
> > In getline (do_getline/do_getline_redir), if the corrupted length is equal
> > to EOF, it will trigger a silent bypass of the rest of the file. More
> > likely, some other value will mislead buffer memory management routines and
> > crash gawk.
> >
> > This bare getline fails fatally in set_record's buffer resizing loop, when
> > it gives up trying to accomodate what it thinks is a humongous record
> > (field.c:284: `cnt >= databuf_size` promotes a negative int cnt to unsigned
> > long).
> >
> > $ gawk 'BEGIN {printf("\n%2147483647s\n", "a")}' | gawk '{getline} END
> > {print NR}'
> > 2
> > $ gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline} END
> > {print NR}'
> > gawk: cmd. line:1: (FILENAME=- FNR=2) fatal: input record too large
> >
> >
> > This getline var dies in make_string (make_str_node) from a corrupted
> > allocation request:
> >
> > $ gawk 'BEGIN {printf("\n%2147483647s\n", "a")}' | gawk '{getline var} END
> > {print NR}'
> > 2
> > $ gawk 'BEGIN {printf("\n%2147483648s\n", "a")}' | gawk '{getline var} END
> > {print NR}'
> > gawk: cmd. line:1: (FILENAME=- FNR=2) fatal: node.c:415:make_str_node:
> > r->stptr: cannot allocate -2147483647 bytes of memory: Cannot allocate
> > memory
> >
> >
> > If INT_MAX is deemed sufficient, despite the use of capacious size_t i/o
> > buffers, here's a diff.
> >
> > diff --git a/io.c b/io.c
> > index 91c94d9b..4e777d75 100644
> > --- a/io.c
> > +++ b/io.c
> > @@ -4026,6 +4026,9 @@ get_a_record(char **out, /* pointer to pointer
> > to data */
> > iop->dataend += iop->count;
> > }
> >
> > + if (recm.len > INT_MAX)
> > + fatal(_("input record length too large to return"));
> > +
> > /* set record, RT, return right value */
> >
> > /*
>