[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#65726: 29.1.50; Crash in regexp engine
From: |
Stefan Monnier |
Subject: |
bug#65726: 29.1.50; Crash in regexp engine |
Date: |
Sat, 09 Sep 2023 11:55:28 -0400 |
User-agent: |
Gnus/5.13 (Gnus v5.13) |
>> Adding an `assert (p2 >= done)` confirms that this does happen,
>
> Can you give an example of such a regexp? I ran `make check` with your patch
> applied and checking enabled but got no assertion failure.
My patch didn't include the assertion :-)
The patch below does and hits an assertion failure during the dump:
Loading format...
Loading bindings...
WOW1!
Error: error ("WOW1!")
mapbacktrace(#[1028
"\1\4\203\24\0\301\302!\210\300\4!\210\301\303!\210\202\35\0\301\304!\210\3\3B\262\1\211\2035\0\300\1@!\210\211A\211\262\2\2035\0\301\305!\210\202!\0\301\306!\207"
[prin1 princ " " "(" " (" " " ")\n"] 7 "\n\n(fn EVALD FUNC ARGS FLAGS)"])
debug-early-backtrace()
debug-early(error (error "WOW1!"))
string-match("\\`[^ ]+\\( [^ ]+\\)*\\'" "h r" nil t)
key-valid-p("h r")
keymap--check("h r")
keymap-set((keymap (27 keymap (119 . eww-search-words)) (111 . occur)) "h
r" highlight-regexp)
define-keymap("o" occur "M-w" eww-search-words "h r" highlight-regexp "h
p" highlight-phrase "h l" highlight-lines-matching-regexp "h ."
highlight-symbol-at-point "h u" unhighlight-regexp "h f" hi-lock-find-patterns
"h w" hi-lock-write-interactive-patterns)
(defvar search-map (define-keymap "o" 'occur "M-w" 'eww-search-words "h
r" 'highlight-regexp "h p" 'highlight-phrase "h l"
'highlight-lines-matching-regexp "h ." 'highlight-symbol-at-point "h u"
'unhighlight-regexp "h f" 'hi-lock-find-patterns "h w"
'hi-lock-write-interactive-patterns)
("/home/monnier/src/emacs/main/lisp/bindings.elc" . 40677))
load("bindings")
load("loadup.el")
-- Stefan
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 394ba22e9b0..b47cdfbfef8 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3643,19 +3643,125 @@ execute_charset (re_char **pp, int c, int corig, bool
unibyte,
return not;
}
+/* Case where `p2` points to an `exactn`. */
+static bool
+mutually_exclusive_exactn (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2)
+{
+ bool multibyte = RE_MULTIBYTE_P (bufp);
+ int c
+ = (re_opcode_t) *p2 == endline ? '\n'
+ : RE_STRING_CHAR (p2 + 2, multibyte);
+
+ if ((re_opcode_t) *p1 == exactn)
+ {
+ if (c != RE_STRING_CHAR (p1 + 2, multibyte))
+ {
+ DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
+ return true;
+ }
+ }
+
+ else if ((re_opcode_t) *p1 == charset
+ || (re_opcode_t) *p1 == charset_not)
+ {
+ if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
+ Qnil))
+ {
+ DEBUG_PRINT (" No match => fast loop.\n");
+ return true;
+ }
+ }
+ else if ((re_opcode_t) *p1 == anychar
+ && c == '\n')
+ {
+ DEBUG_PRINT (" . != \\n => fast loop.\n");
+ return true;
+ }
+ return false;
+}
+
+/* Case where `p2` points to an `charset`. */
+static bool
+mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2)
+{
+ /* It is hard to list up all the character in charset
+ P2 if it includes multibyte character. Give up in
+ such case. */
+ if (!RE_MULTIBYTE_P (bufp) || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
+ {
+ /* Now, we are sure that P2 has no range table.
+ So, for the size of bitmap in P2, 'p2[1]' is
+ enough. But P1 may have range table, so the
+ size of bitmap table of P1 is extracted by
+ using macro 'CHARSET_BITMAP_SIZE'.
+
+ In a multibyte case, we know that all the character
+ listed in P2 is ASCII. In a unibyte case, P1 has only a
+ bitmap table. So, in both cases, it is enough to test
+ only the bitmap table of P1. */
+
+ if ((re_opcode_t) *p1 == charset)
+ {
+ int idx;
+ /* We win if the charset inside the loop
+ has no overlap with the one after the loop. */
+ for (idx = 0;
+ (idx < (int) p2[1]
+ && idx < CHARSET_BITMAP_SIZE (p1));
+ idx++)
+ if ((p2[2 + idx] & p1[2 + idx]) != 0)
+ break;
+
+ if (idx == p2[1]
+ || idx == CHARSET_BITMAP_SIZE (p1))
+ {
+ DEBUG_PRINT (" No match => fast loop.\n");
+ return true;
+ }
+ }
+ else if ((re_opcode_t) *p1 == charset_not)
+ {
+ int idx;
+ /* We win if the charset_not inside the loop lists
+ every character listed in the charset after. */
+ for (idx = 0; idx < (int) p2[1]; idx++)
+ if (! (p2[2 + idx] == 0
+ || (idx < CHARSET_BITMAP_SIZE (p1)
+ && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
+ break;
+
+ if (idx == p2[1])
+ {
+ DEBUG_PRINT (" No match => fast loop.\n");
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
/* True if "p1 matches something" implies "p2 fails". */
+/* Avoiding inf-loops:
+ We're trying to follow all paths reachable from `p2`, but since some
+ loops can match the empty string, this can loop back to `p2`.
+ To avoid inf-looping, we rely on a lexicographic ordering using `done`
+ and `p2`: at every recursion, either `done` is larger, or it
+ is unchanged and `p2` is larger. */
static bool
-mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
- re_char *p2)
+mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2, re_char *done)
{
re_opcode_t op2;
- bool multibyte = RE_MULTIBYTE_P (bufp);
unsigned char *pend = bufp->buffer + bufp->used;
re_char *p2_orig = p2;
eassert (p1 >= bufp->buffer && p1 < pend
&& p2 >= bufp->buffer && p2 <= pend);
+ eassert (p2 >= done);
+
/* Skip over open/close-group commands.
If what follows this loop is a ...+ construct,
look at what begins its body, since we will have to
@@ -3684,98 +3790,14 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp,
re_char *p1,
case endline:
case exactn:
- {
- int c
- = (re_opcode_t) *p2 == endline ? '\n'
- : RE_STRING_CHAR (p2 + 2, multibyte);
-
- if ((re_opcode_t) *p1 == exactn)
- {
- if (c != RE_STRING_CHAR (p1 + 2, multibyte))
- {
- DEBUG_PRINT (" '%c' != '%c' => fast loop.\n", c, p1[2]);
- return true;
- }
- }
-
- else if ((re_opcode_t) *p1 == charset
- || (re_opcode_t) *p1 == charset_not)
- {
- if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
- Qnil))
- {
- DEBUG_PRINT (" No match => fast loop.\n");
- return true;
- }
- }
- else if ((re_opcode_t) *p1 == anychar
- && c == '\n')
- {
- DEBUG_PRINT (" . != \\n => fast loop.\n");
- return true;
- }
- }
- break;
+ return mutually_exclusive_exactn (bufp, p1, p2);
case charset:
{
if ((re_opcode_t) *p1 == exactn)
- /* Reuse the code above. */
- return mutually_exclusive_p (bufp, p2, p1);
-
- /* It is hard to list up all the character in charset
- P2 if it includes multibyte character. Give up in
- such case. */
- else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
- {
- /* Now, we are sure that P2 has no range table.
- So, for the size of bitmap in P2, 'p2[1]' is
- enough. But P1 may have range table, so the
- size of bitmap table of P1 is extracted by
- using macro 'CHARSET_BITMAP_SIZE'.
-
- In a multibyte case, we know that all the character
- listed in P2 is ASCII. In a unibyte case, P1 has only a
- bitmap table. So, in both cases, it is enough to test
- only the bitmap table of P1. */
-
- if ((re_opcode_t) *p1 == charset)
- {
- int idx;
- /* We win if the charset inside the loop
- has no overlap with the one after the loop. */
- for (idx = 0;
- (idx < (int) p2[1]
- && idx < CHARSET_BITMAP_SIZE (p1));
- idx++)
- if ((p2[2 + idx] & p1[2 + idx]) != 0)
- break;
-
- if (idx == p2[1]
- || idx == CHARSET_BITMAP_SIZE (p1))
- {
- DEBUG_PRINT (" No match => fast loop.\n");
- return true;
- }
- }
- else if ((re_opcode_t) *p1 == charset_not)
- {
- int idx;
- /* We win if the charset_not inside the loop lists
- every character listed in the charset after. */
- for (idx = 0; idx < (int) p2[1]; idx++)
- if (! (p2[2 + idx] == 0
- || (idx < CHARSET_BITMAP_SIZE (p1)
- && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
- break;
-
- if (idx == p2[1])
- {
- DEBUG_PRINT (" No match => fast loop.\n");
- return true;
- }
- }
- }
+ return mutually_exclusive_exactn (bufp, p2, p1);
+ else
+ return mutually_exclusive_charset (bufp, p1, p2);
}
break;
@@ -3783,9 +3805,9 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp,
re_char *p1,
switch (*p1)
{
case exactn:
+ return mutually_exclusive_exactn (bufp, p2, p1);
case charset:
- /* Reuse the code above. */
- return mutually_exclusive_p (bufp, p2, p1);
+ return mutually_exclusive_charset (bufp, p2, p1);
case charset_not:
/* When we have two charset_not, it's very unlikely that
they don't overlap. The union of the two sets of excluded
@@ -3830,11 +3852,25 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp,
re_char *p1,
int mcnt;
p2++;
EXTRACT_NUMBER_AND_INCR (mcnt, p2);
- /* Don't just test `mcnt > 0` because non-greedy loops have
- their test at the end with an unconditional jump at the start. */
- if (p2 > p2_orig && mcnt >= 0) /* Ensure forward progress. */
- return (mutually_exclusive_p (bufp, p1, p2)
- && mutually_exclusive_p (bufp, p1, p2 + mcnt));
+ re_char *p2_other = p2 + mcnt;
+ /* Presumably, any position up to `done` should be a position we
+ already considered elsewhere (because our state machines are not
+ arbitrary: they consists of syntaxically nested loops with limited
+ control flow), so we can consider those back jumps as mutually
+ exclusive here. */
+ if (p2 < done)
+ error ("WOW1!");
+ if (p2_other < done)
+ error ("WOW2!");
+ return ((p2 < done ? true
+ : p2 == done ? true
+ : mutually_exclusive_aux (bufp, p1, p2,
+ p2 > p2_orig ? done : p2))
+ && (p2_other < done ? true
+ : p2_other == done ? true
+ : mutually_exclusive_aux (bufp, p1, p2_other,
+ p2_other > p2_orig
+ ? done : p2_other)));
break;
}
@@ -3846,6 +3882,12 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp,
re_char *p1,
return false;
}
+static bool
+mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
+ re_char *p2)
+{
+ return mutually_exclusive_aux (bufp, p1, p2, min (p2, p1));
+}
/* Matching routines. */
- bug#65726: 29.1.50; Crash in regexp engine, (continued)
- bug#65726: 29.1.50; Crash in regexp engine, Eli Zaretskii, 2023/09/04
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/04
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/04
- bug#65726: 29.1.50; Crash in regexp engine, Eli Zaretskii, 2023/09/04
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/04
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/05
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/05
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/05
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/05
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/06
- bug#65726: 29.1.50; Crash in regexp engine,
Stefan Monnier <=
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/09
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/14
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/15
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/15
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/15
- bug#65726: 29.1.50; Crash in regexp engine, Mattias Engdegård, 2023/09/16
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/16
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/17
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/17
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/18