[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#65726: 29.1.50; Crash in regexp engine
From: |
Stefan Monnier |
Subject: |
bug#65726: 29.1.50; Crash in regexp engine |
Date: |
Sun, 17 Sep 2023 22:14:37 -0400 |
User-agent: |
Gnus/5.13 (Gnus v5.13) |
> I'm reworking my code so it takes 2 arguments: the loop-entry and the
> loop-exit. Any jump outside of those bounds should never happen so we
> can assert it to check that our assumptions are right (and we can
> return false when we don't compile assertions, so it's never over-optimistic).
Hmm... not quite working yet. With the patch below I get:
% make test/src/regex-emacs-tests
[...]
passed 30/34 regexp-multibyte-unibyte (0.000264 sec)
0: /on_failure_jump_smart to 9
3: /exactn/1/x
6: /jump to 0
9: /start_memory/1
11: /on_failure_jump to 26
14: /on_failure_jump_smart to 23
17: /exactn/1/=
20: /jump to 14
23: /jump to 29
26: /exactn/1/h
29: /stop_memory/1
31: /on_failure_jump_loop to 37
34: /jump to 9
37: /succeed
38: end of pattern.
38 bytes used/128 bytes allocated.
re_nsub: 1 regs_alloc: 1 can_be_null: 0
p1==3, p2==34, loop-entry==14, loop-exit==26
Test regexp-tests-backtrack-optimization backtrace:
looking-at("x*\\(=*\\|h\\)+")
[...]
The problem is that my code sees the `jump 9` (near the end) followed by
`start_memory` and `on_failure_jump to 26` as a a backward jump that
defines a loop whose body starts at 14 and whose exit point is at 26,
but that 14..26 is not a loop, it's a `|` and those don't obey the
assumption I made about the exit point (the 2 destinations of such an
`on_failure_jump` correspond to the first and the second patterns of the
`|`, i.e. entry&middle rather than entry&exit, so there *can* be
jumps straight out from the first point without going through the second
point đ).
BTW, here's an indented version of the code:
0: /on_failure_jump_smart to 9 {
3: /exactn/1/x
6: } /jump to 0
9: { /start_memory/1
11: /on_failure_jump to 26
14: /on_failure_jump_smart to 23 {
17: /exactn/1/=
20: } /jump to 14
23: /jump to 29
26: /exactn/1/h
29: /stop_memory/1
31: /on_failure_jump_loop to 37
34: } /jump to 9
37: /succeed
38: end of pattern.
Another problem we see here is that the main (second) loop spans 9..37
and is controlled by the `/on_failure_jump_loop to 37` but the two
destination of the OFJ are not 9 and 37 but 34 and 37 đ.
So maybe we should `skip_noops` before looking at the destinations to
decide if it's a loop?
Stefan
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 55d0a6e8df8..c54f9f81890 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3068,8 +3068,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap,
bool multibyte)
continue;
case succeed_n:
- /* If N == 0, it should be an on_failure_jump_loop instead. */
- DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
+ /* If N == 0, it should be an on_failure_jump_loop instead.
+ `j` can be negative because `EXTRACT_NUMBER` extracts a
+ signed number whereas `succeed_n` treats it as unsigned. */
+ DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
p += 4;
/* We only care about one iteration of the loop, so we don't
need to consider the case where this behaves like an
@@ -3743,20 +3745,20 @@ mutually_exclusive_charset (struct re_pattern_buffer
*bufp, re_char *p1,
}
/* True if "p1 matches something" implies "p2 fails". */
-/* Avoiding inf-loops:
- We're trying to follow all paths reachable from `p2`, but since some
+/* We're trying to follow all paths reachable from `p2`, but since some
loops can match the empty string, this can loop back to `p2`.
- To avoid inf-looping, we keep track of points that have been considered
- "already". Instead of keeping a list of such points, `done_beg` and
- `done_end` delimit a chunk of bytecode we already considered.
- To guarantee termination, a lexical ordering between `done_*` and `p2`
- should be obeyed:
- At each recursion, either `done_beg` gets smaller,
- or `done_beg` is unchanged and `done_end` gets larger
- or they're both unchanged and `p2` gets larger. */
+ To avoid inf-looping, we take advantage of the fact that
+ the bytecode we generate is made of syntactically nested loops, more
+ specifically, every loop has a single entry point and single exit point.
+ The function takes 2 more arguments (`loop_entry` and `loop_exit`).
+ `loop_entry` points to the sole entry point of the current loop and
+ `loop_exit` points to its sole exit point (when non-NULL).
+ The function can assume that `loop_exit` is "mutually exclusive".
+ Termination of recursive calls is ensured because either `loop_entry`
+ is larger, or it's equal but `p2` is larger. */
static bool
mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
- re_char *p2, re_char *done_beg, re_char *done_end)
+ re_char *p2, re_char *loop_entry, re_char *loop_exit)
{
re_opcode_t op2;
unsigned char *pend = bufp->buffer + bufp->used;
@@ -3765,8 +3767,23 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp,
re_char *p1,
eassert (p1 >= bufp->buffer && p1 < pend
&& p2 >= bufp->buffer && p2 <= pend);
- eassert (done_beg <= done_end);
- eassert (done_end <= p2);
+ if (p2 == loop_exit)
+ /* Presumably already checked elsewhere. */
+ return true;
+ eassert (loop_entry && p2 >= loop_entry);
+ /* eassert (!loop_exit || p2 < loop_exit); */
+ if (p2 < loop_entry)
+ return false;
+ if (loop_exit && p2 > loop_exit)
+ {
+ void print_compiled_pattern (struct re_pattern_buffer *bufp);
+ print_compiled_pattern (bufp);
+ fprintf (stderr, "p1==%d, p2==%d, loop-entry==%d, loop-exit==%d\n",
+ p1 - bufp->buffer, p2 - bufp->buffer,
+ loop_entry - bufp->buffer, loop_exit - bufp->buffer);
+ error ("WOW1!");
+ }
+ /* return false; */
/* Skip over open/close-group commands.
If what follows this loop is a ...+ construct,
@@ -3860,27 +3877,30 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp,
re_char *p1,
EXTRACT_NUMBER_AND_INCR (mcnt, p2);
re_char *p2_other = p2 + mcnt;
- /* When we jump backward we bump `done_end` up to `p3` under
- the assumption that any other position between `done_end`
- and `p3` is either:
- - checked by the other call to RECURSE.
- - not reachable from here (e.g. for positions before the
- `on_failure_jump`), or at least not without first
- jumping before `done_beg`.
- This should hold because our state machines are not arbitrary:
- they consists of syntaxically nested loops with limited
- control flow.
- FIXME: This can fail (i.e. return true when it shouldn't)
- if we start generating bytecode with a different shape,
- so maybe we should bite the bullet and replace done_beg/end
- with an actual list of positions we've already processed. */
-#define RECURSE(p3) \
- ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \
- : (p3) <= done_end ? true \
- : mutually_exclusive_aux (bufp, p1, p3, done_beg, \
- (p3) > p2_orig ? done_end : (p3)))
-
- return RECURSE (p2) && RECURSE (p2_other);
+ /* If we jump backward to the entry point of the current loop
+ it means it's a zero-length cycle through that loop, so
+ this cycle itself does not break mutual-exclusion.
+ If we jump backward to a new loop nested within the current one
+ then the `p2_other` points to the exit of that inner loop
+ and the other RECURSE will check what happens when we leave
+ the loop so we can focus on checking just the loop body,
+ so we pass new values for `loop-entry` and `loop_exit`.
+ In all other cases, we just recurse "normally": if it's before
+ `loop_entry` it's a bug that will be caught by checks at
+ the entrace of the function and otherwise it's a forward jump
+ which doesn't need any special treatment.
+ FIXME: This is failsafe (can't return true when it shouldn't)
+ but it could be too conservative if we start generating bytecode
+ with a different shape, so maybe we should bite the bullet and
+ replace done_beg/end with an actual list of positions we've
+ already processed. */
+#define RECURSE(p3, p3_other) \
+ ((p3) == loop_entry ? true \
+ : loop_entry < (p3) && (p3) < p2_orig ? \
+ mutually_exclusive_aux (bufp, p1, p3, p3, p3_other) \
+ : mutually_exclusive_aux (bufp, p1, p3, loop_entry, loop_exit))
+
+ return RECURSE (p2, p2_other) && RECURSE (p2_other, p2);
}
default:
@@ -3895,7 +3915,7 @@ #define RECURSE(p3)
\
mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
re_char *p2)
{
- return mutually_exclusive_aux (bufp, p1, p2, p2, p2);
+ return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL);
}
/* Matching routines. */
- bug#65726: 29.1.50; Crash in regexp engine, (continued)
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/05
- bug#65726: 29.1.50; Crash in regexp engine, Mattias EngdegÄrd, 2023/09/06
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/09
- bug#65726: 29.1.50; Crash in regexp engine, Mattias EngdegÄrd, 2023/09/09
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/14
- bug#65726: 29.1.50; Crash in regexp engine, Mattias EngdegÄrd, 2023/09/15
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/15
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/15
- bug#65726: 29.1.50; Crash in regexp engine, Mattias EngdegÄrd, 2023/09/16
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/16
- bug#65726: 29.1.50; Crash in regexp engine,
Stefan Monnier <=
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/17
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/18
- bug#65726: 29.1.50; Crash in regexp engine, Mattias EngdegÄrd, 2023/09/21
- bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/21
- bug#65726: 29.1.50; Crash in regexp engine, Mattias EngdegÄrd, 2023/09/23
bug#65726: 29.1.50; Crash in regexp engine, Stefan Monnier, 2023/09/04