From 2fd319ae0e4047c5579809802b0851187c040e2a Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Wed, 10 Aug 2022 13:59:03 -0700
Subject: [PATCH 3/3] Improve integer overflow checking for formats

* awk.h: Include intprops.h here instead of builtin.c.
(UINTMAX_MAX, SIZE_MAX): Move here from builtin.c.
Simplify UINTMAX_MAX definiens.
(WCHAR_MAX, UINTMAX_WIDTH): New macros.
(get_number_ui): Remove; unused.
(get_number_si, get_number_uj):
Replace these macros with functions that check for integer overflow.
* builtin.c (TYPE_SIGNED, TYPE_MINIMUM, TYPE_MAXIMUM):
Remove; now defined by intprops.h.
(INTMAX_MIN): Remove; unused.
(UINTMAX_MAX, SIZE_MAX): Move to awk.h.
(format_tree): Last arg is int, not long.  All uses changed.
Check for int overflow when computing field width,
precision, arg number, or a wchar_t value.
Remove no-longer-needed casts to int.
* mpfr.c (GMP_NUMB_BITS, mpz_to_uintmax) [HAVE_MPFR]:
New constant and function, copied from GNU Emacs.
(integer_overflow): New function.
(get_number_si, get_number_uj): Now functions, which check
for integer overflow.
---
 ChangeLog               | 22 +++++++++
 README_d/README.hacking |  8 ++--
 awk.h                   | 30 ++++++++-----
 builtin.c               | 54 ++++++++---------------
 mpfr.c                  | 98 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 162 insertions(+), 50 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index eb27aae3..a5248ec9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,27 @@
 2022-08-10  Paul Eggert  <eggert@cs.ucla.edu>
 
+	Improve integer overflow checking for formats
+	* awk.h: Include intprops.h here instead of builtin.c.
+	(UINTMAX_MAX, SIZE_MAX): Move here from builtin.c.
+	Simplify UINTMAX_MAX definiens.
+	(WCHAR_MAX, UINTMAX_WIDTH): New macros.
+	(get_number_ui): Remove; unused.
+	(get_number_si, get_number_uj):
+	Replace these macros with functions that check for integer overflow.
+	* builtin.c (TYPE_SIGNED, TYPE_MINIMUM, TYPE_MAXIMUM):
+	Remove; now defined by intprops.h.
+	(INTMAX_MIN): Remove; unused.
+	(UINTMAX_MAX, SIZE_MAX): Move to awk.h.
+	(format_tree): Last arg is int, not long.  All uses changed.
+	Check for int overflow when computing field width,
+	precision, arg number, or a wchar_t value.
+	Remove no-longer-needed casts to int.
+	* mpfr.c (GMP_NUMB_BITS, mpz_to_uintmax) [HAVE_MPFR]:
+	New constant and function, copied from GNU Emacs.
+	(integer_overflow): New function.
+	(get_number_si, get_number_uj): Now functions, which check
+	for integer overflow.
+
 	Use _Noreturn a bit more
 	This helps avoid some warnings with GCC 12 in developer mode,
 	in future patches.
diff --git a/README_d/README.hacking b/README_d/README.hacking
index d179488c..63583b89 100644
--- a/README_d/README.hacking
+++ b/README_d/README.hacking
@@ -1,9 +1,9 @@
-* Use one of the following macros to access the value of a numeric NODE:
-	Macro                   Returned C type
+* Use one of the following to access the value of a numeric NODE:
+	Call			Returned C type
 	---------------------------------------
-	get_number_ui(n)        unsigned long
 	get_number_si(n)        long
-	get_number_d(n)		double	
+	get_number_sj(n)	intmax_t
 	get_number_uj(n)	uintmax_t
+	get_number_d(n)		double
 
 * Use iszero(n) to test if a numeric NODE is zero.
diff --git a/awk.h b/awk.h
index 50f6abe9..3a5c5af1 100644
--- a/awk.h
+++ b/awk.h
@@ -102,6 +102,22 @@ extern int errno;
 # include <stdint.h>
 #endif
 
+#include "intprops.h"
+
+#ifndef UINTMAX_MAX
+# define UINTMAX_MAX ((uintmax_t) -1)
+#endif
+#ifndef SIZE_MAX
+# define SIZE_MAX ((size_t) -1)
+#endif
+#ifndef WCHAR_MAX
+# define WCHAR_MAX TYPE_MAXIMUM (wchar_t)
+#endif
+
+#ifndef UINTMAX_WIDTH
+# define UINTMAX_WIDTH TYPE_WIDTH (uintmax_t)
+#endif
+
 /* ----------------- System dependencies (with more includes) -----------*/
 
 /* This section is the messiest one in the file, not a lot that can be done */
@@ -1321,14 +1337,8 @@ DEREF(NODE *r)
  (!((n)->flags & (MPFN|MPZN)) ? (dblval) : (((n)->flags & MPFN) ? (mpfrval) : (mpzval)))
 
 /* conversion to C types */
-#define get_number_ui(n)	numtype_choose((n), mpfr_get_ui((n)->mpg_numbr, ROUND_MODE), mpz_get_ui((n)->mpg_i), (unsigned long) (n)->numbr)
-
-#define get_number_si(n)	numtype_choose((n), mpfr_get_si((n)->mpg_numbr, ROUND_MODE), mpz_get_si((n)->mpg_i), (long) (n)->numbr)
-
 #define get_number_d(n)		numtype_choose((n), mpfr_get_d((n)->mpg_numbr, ROUND_MODE), mpz_get_d((n)->mpg_i), (double) (n)->numbr)
 
-#define get_number_uj(n)	numtype_choose((n), mpfr_get_uj((n)->mpg_numbr, ROUND_MODE), (uintmax_t) mpz_get_d((n)->mpg_i), (uintmax_t) (n)->numbr)
-
 #define is_zero(n)		numtype_choose((n), mpfr_zero_p((n)->mpg_numbr), (mpz_sgn((n)->mpg_i) == 0), ((n)->numbr == 0.0))
 
 #define IEEE_FMT(r, t)		(void) (do_ieee_fmt && format_ieee(r, t))
@@ -1339,10 +1349,7 @@ DEREF(NODE *r)
 #define is_mpg_integer(n)	(((n)->flags & MPZN) != 0)
 #define is_mpg_number(n)	(((n)->flags & (MPZN|MPFN)) != 0)
 #else
-#define get_number_ui(n)	(unsigned long) (n)->numbr
-#define get_number_si(n)	(long) (n)->numbr
 #define get_number_d(n)		(double) (n)->numbr
-#define get_number_uj(n)	(uintmax_t) (n)->numbr
 
 #define is_mpg_number(n)	0
 #define is_mpg_float(n)		0
@@ -1506,7 +1513,7 @@ extern NODE *do_sub(int nargs, unsigned int flags);
 extern NODE *call_sub(const char *name, int nargs);
 extern NODE *call_match(int nargs);
 extern NODE *call_split_func(const char *name, int nargs);
-extern NODE *format_tree(const char *, size_t, NODE **, long);
+extern NODE *format_tree(const char *, size_t, NODE **, int);
 extern NODE *do_lshift(int nargs);
 extern NODE *do_rshift(int nargs);
 extern NODE *do_and(int nargs);
@@ -1715,6 +1722,9 @@ extern void *mpfr_mem_alloc(size_t alloc_size);
 extern void *mpfr_mem_realloc(void *ptr, size_t old_size, size_t new_size);
 extern void mpfr_mem_free(void *ptr, size_t size);
 #endif
+extern _Noreturn void integer_overflow(void);
+extern uintmax_t get_number_uj(NODE *);
+extern long get_number_si(NODE *);
 /* msg.c */
 extern _Noreturn void gawk_exit(int status);
 extern _Noreturn void final_exit(int status) ATTRIBUTE_NORETURN;
diff --git a/builtin.c b/builtin.c
index 04fac763..b2fc6a3a 100644
--- a/builtin.c
+++ b/builtin.c
@@ -39,28 +39,6 @@
 # define CHAR_BIT 8
 #endif
 
-/* The extra casts work around common compiler bugs.  */
-#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
-/* Note:  these assume that negative integers are represented internally
-   via 2's complement, which is not mandated by C.  They also ignore the
-   fact that signed integer arithmetic overflow can trigger exceptions,
-   unlike unsigned which is guaranteed not to do so. */
-#define TYPE_MINIMUM(t) ((t) (TYPE_SIGNED (t) \
-			      ? ~ (uintmax_t) 0 << (sizeof (t) * CHAR_BIT - 1) \
-			      : 0))
-#define TYPE_MAXIMUM(t) ((t) (~ (t) 0 - TYPE_MINIMUM (t)))
-
-#ifndef INTMAX_MIN
-# define INTMAX_MIN TYPE_MINIMUM (intmax_t)
-#endif
-#ifndef UINTMAX_MAX
-# define UINTMAX_MAX TYPE_MAXIMUM (uintmax_t)
-#endif
-
-#ifndef SIZE_MAX	/* C99 constant, can't rely on it everywhere */
-#define SIZE_MAX ((size_t) -1)
-#endif
-
 #define DEFAULT_G_PRECISION 6
 
 static size_t mbc_byte_count(const char *ptr, size_t numchars);
@@ -707,7 +685,7 @@ format_tree(
 	const char *fmt_string,
 	size_t n0,
 	NODE **the_args,
-	long num_args)
+	int num_args)
 {
 /* copy 'l' bytes from 's' to 'obufout' checking for space in the process */
 /* difference of pointers should be of ptrdiff_t type, but let us be kind */
@@ -747,7 +725,7 @@ format_tree(
 	osiz += delta; \
 }
 
-	size_t cur_arg = 0;
+	int cur_arg = 0;
 	NODE *r = NULL;
 	int i, nc;
 	bool toofew = false;
@@ -757,10 +735,11 @@ format_tree(
 	const char *s0, *s1;
 	int cs1;
 	NODE *arg;
-	long fw, prec, argnum;
+	int fw, prec, argnum;
 	bool used_dollar;
 	bool lj, alt, have_prec, need_format;
-	long *cur = NULL;
+	int *cur = NULL;
+	long lval;
 	uintmax_t uval;
 	bool sgn;
 	int base;
@@ -1000,7 +979,7 @@ check_pos:
 					goto out;
 				}
 				if (argnum >= num_args) {
-					msg(_("fatal: argument index %ld greater than total number of supplied arguments"), argnum);
+					msg(_("fatal: argument index %d greater than total number of supplied arguments"), argnum);
 					goto out;
 				}
 			} else {
@@ -1041,9 +1020,12 @@ check_pos:
 				parse_next_arg();
 			}
 			(void) force_number(arg);
-			*cur = get_number_si(arg);
+			lval = get_number_si(arg);
+			if (INT_ADD_WRAPV(lval, 0, cur))
+				integer_overflow();
 			if (*cur < 0 && cur == &fw) {
-				*cur = -*cur;
+				if (INT_SUBTRACT_WRAPV(0, *cur, cur))
+					integer_overflow();
 				lj = true;
 			}
 			if (cur == &prec) {
@@ -1133,7 +1115,7 @@ check_pos:
 					memset(& mbs, 0, sizeof(mbs));
 
 					/* handle systems with too small wchar_t */
-					if (sizeof(wchar_t) < 4 && uval > 0xffff) {
+					if (WCHAR_MAX < uval) {
 						if (do_lint)
 							lintwarn(
 						_("[s]printf: value %g is too big for %%c format"),
@@ -1629,21 +1611,21 @@ mpf1:
 			case MP_INT_WITH_PREC:
 				sprintf(cp, "*.*Z%c", cs1);
 				while ((nc = mpfr_snprintf(obufout, ofre, cpbuf,
-					     (int) fw, (int) prec, zi)) >= (int) ofre)
+					     fw, prec, zi)) >= (int) ofre)
 					chksize(nc)
 				need_to_add_thousands = true;
 				break;
 			case MP_INT_WITHOUT_PREC:
 				sprintf(cp, "*Z%c", cs1);
 				while ((nc = mpfr_snprintf(obufout, ofre, cpbuf,
-					     (int) fw, zi)) >= (int) ofre)
+					     fw, zi)) >= (int) ofre)
 					chksize(nc)
 				need_to_add_thousands = true;
 				break;
 			case MP_FLOAT:
 				sprintf(cp, "*.*R*%c", cs1);
 				while ((nc = mpfr_snprintf(obufout, ofre, cpbuf,
-					     (int) fw, (int) prec, ROUND_MODE, mf)) >= (int) ofre)
+					     fw, prec, ROUND_MODE, mf)) >= (int) ofre)
 					chksize(nc)
 				break;
 #endif
@@ -1651,7 +1633,7 @@ mpf1:
 				if (have_prec || tolower(cs1) != 'a') {
 					sprintf(cp, "*.*%c", cs1);
 					while ((nc = snprintf(obufout, ofre, cpbuf,
-						     (int) fw, (int) prec,
+						     fw, prec,
 						     (double) tmpval)) >= (int) ofre)
 						chksize(nc)
 				} else {
@@ -1659,7 +1641,7 @@ mpf1:
 					// wasn't supplied by the user.
 					sprintf(cp, "*%c", cs1);
 					while ((nc = snprintf(obufout, ofre, cpbuf,
-						     (int) fw,
+						     fw,
 						     (double) tmpval)) >= (int) ofre)
 						chksize(nc)
 				}
@@ -2776,7 +2758,7 @@ do_match(int nargs)
 		dest = POP_PARAM();
 		if (dest->type != Node_var_array)
 			fatal(_("match: third argument is not an array"));
-		check_symtab_functab(dest, "match", 
+		check_symtab_functab(dest, "match",
 				_("%s: cannot use %s as third argument"));
 		assoc_clear(dest);
 	}
diff --git a/mpfr.c b/mpfr.c
index 21bac6cd..3fde64e3 100644
--- a/mpfr.c
+++ b/mpfr.c
@@ -2015,3 +2015,101 @@ mpfr_unset(NODE *n)
 	/* dummy function */
 }
 #endif
+
+
+#ifdef HAVE_MPFR
+
+/* The following is taken from GNU Emacs.  */
+
+/* Number of data bits in a limb.  */
+# ifndef GMP_NUMB_BITS
+enum { GMP_NUMB_BITS = TYPE_WIDTH (mp_limb_t) };
+# endif
+
+/* If Z fits into *PI, store its value there and return true.
+   Return false otherwise.  */
+static bool
+mpz_to_uintmax (mpz_t const z, uintmax_t *pi)
+{
+  if (mpz_sgn (z) < 0)
+    return false;
+  ptrdiff_t bits = mpz_sizeinbase (z, 2);
+  if (UINTMAX_WIDTH < bits)
+    return false;
+
+  uintmax_t v = 0;
+  int i = 0, shift = 0;
+
+  do
+    {
+      uintmax_t limb = mpz_getlimbn (z, i++);
+      v += limb << shift;
+      shift += GMP_NUMB_BITS;
+    }
+  while (shift < bits);
+
+  *pi = v;
+  return true;
+}
+#endif
+
+/* integer_overflow --- report a fatal integer overflow */
+
+void
+integer_overflow(void)
+{
+	fatal(_("integer overflow"));
+}
+
+/* get_number_si --- get a long; fatal error if the value is out of range */
+
+long
+get_number_si(NODE *n)
+{
+	double lo = LONG_MIN, hi = LONG_MAX;
+#ifdef HAVE_MPFR
+	if (n->flags & (MPFN | MPZN)) {
+		if (n->flags & MPFN) {
+			long r;
+			mpfr_clear_erangeflag ();
+			r = mpfr_get_si(n->mpg_numbr, ROUND_MODE);
+			if (mpfr_erangeflag_p())
+				integer_overflow();
+			return r;
+		} else {
+			if (! mpz_fits_slong_p(n->mpg_i))
+				integer_overflow();
+			return mpz_get_si(n->mpg_i);
+		}
+	}
+#endif
+	if (! (lo - 1 < n->numbr && n->numbr < hi + 1))
+		integer_overflow();
+	return n->numbr;
+}
+
+/* get_number_uj --- get a uintmax_t; fatal error if the value is out of range */
+
+uintmax_t
+get_number_uj(NODE *n)
+{
+	double hi = UINTMAX_MAX;
+#ifdef HAVE_MPFR
+	if (n->flags & (MPFN | MPZN)) {
+		uintmax_t r;
+		if (n->flags & MPFN) {
+			mpfr_clear_erangeflag ();
+			r = mpfr_get_uj(n->mpg_numbr, ROUND_MODE);
+			if (mpfr_erangeflag_p())
+				integer_overflow();
+		} else {
+			if (! mpz_to_uintmax(n->mpg_i, &r))
+				integer_overflow();
+		}
+		return r;
+	}
+#endif
+	if (! (-1 < n->numbr && n->numbr < hi + 1))
+		integer_overflow();
+	return n->numbr;
+}
-- 
2.37.1