bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#65726: 29.1.50; Crash in regexp engine


From: Stefan Monnier
Subject: bug#65726: 29.1.50; Crash in regexp engine
Date: Sun, 17 Sep 2023 22:14:37 -0400
User-agent: Gnus/5.13 (Gnus v5.13)

> I'm reworking my code so it takes 2 arguments: the loop-entry and the
> loop-exit.  Any jump outside of those bounds should never happen so we
> can assert it to check that our assumptions are right (and we can
> return false when we don't compile assertions, so it's never over-optimistic).

Hmm... not quite working yet.  With the patch below I get:

    % make test/src/regex-emacs-tests
    [...]
    passed  30/34  regexp-multibyte-unibyte (0.000264 sec)
    0:      /on_failure_jump_smart to 9
    3:      /exactn/1/x
    6:      /jump to 0
    9:      /start_memory/1
    11:     /on_failure_jump to 26
    14:     /on_failure_jump_smart to 23
    17:     /exactn/1/=
    20:     /jump to 14
    23:     /jump to 29
    26:     /exactn/1/h
    29:     /stop_memory/1
    31:     /on_failure_jump_loop to 37
    34:     /jump to 9
    37:     /succeed
    38:     end of pattern.
    38 bytes used/128 bytes allocated.
    re_nsub: 1      regs_alloc: 1   can_be_null: 0
    p1==3, p2==34, loop-entry==14, loop-exit==26
    Test regexp-tests-backtrack-optimization backtrace:
      looking-at("x*\\(=*\\|h\\)+")
    [...]

The problem is that my code sees the `jump 9` (near the end) followed by
`start_memory` and `on_failure_jump to 26` as a a backward jump that
defines a loop whose body starts at 14 and whose exit point is at 26,
but that 14..26 is not a loop, it's a `|` and those don't obey the
assumption I made about the exit point (the 2 destinations of such an
`on_failure_jump` correspond to the first and the second patterns of the
`|`, i.e. entry&middle rather than entry&exit, so there *can* be
jumps straight out from the first point without going through the second
point 🙁).

BTW, here's an indented version of the code:

    0:      /on_failure_jump_smart to 9 {
    3:        /exactn/1/x
    6:      } /jump to 0
    9:      { /start_memory/1
    11:       /on_failure_jump to 26
    14:       /on_failure_jump_smart to 23 {
    17:         /exactn/1/=
    20:       } /jump to 14
    23:       /jump to 29
    26:       /exactn/1/h
    29:       /stop_memory/1
    31:       /on_failure_jump_loop to 37
    34:     } /jump to 9
    37:     /succeed
    38:     end of pattern.

Another problem we see here is that the main (second) loop spans 9..37
and is controlled by the `/on_failure_jump_loop to 37` but the two
destination of the OFJ are not 9 and 37 but 34 and 37 🙂.

So maybe we should `skip_noops` before looking at the destinations to
decide if it's a loop?


        Stefan


diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 55d0a6e8df8..c54f9f81890 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3068,8 +3068,10 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, 
bool multibyte)
          continue;
 
        case succeed_n:
-         /* If N == 0, it should be an on_failure_jump_loop instead.  */
-         DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j > 0));
+         /* If N == 0, it should be an on_failure_jump_loop instead.
+            `j` can be negative because `EXTRACT_NUMBER` extracts a
+            signed number whereas `succeed_n` treats it as unsigned.  */
+         DEBUG_STATEMENT (EXTRACT_NUMBER (j, p + 2); eassert (j != 0));
          p += 4;
          /* We only care about one iteration of the loop, so we don't
             need to consider the case where this behaves like an
@@ -3743,20 +3745,20 @@ mutually_exclusive_charset (struct re_pattern_buffer 
*bufp, re_char *p1,
 }
 
 /* True if "p1 matches something" implies "p2 fails".  */
-/* Avoiding inf-loops:
-   We're trying to follow all paths reachable from `p2`, but since some
+/* We're trying to follow all paths reachable from `p2`, but since some
    loops can match the empty string, this can loop back to `p2`.
-   To avoid inf-looping, we keep track of points that have been considered
-   "already".  Instead of keeping a list of such points, `done_beg` and
-   `done_end` delimit a chunk of bytecode we already considered.
-   To guarantee termination, a lexical ordering between `done_*` and `p2`
-   should be obeyed:
-       At each recursion, either `done_beg` gets smaller,
-       or `done_beg` is unchanged and `done_end` gets larger
-       or they're both unchanged and `p2` gets larger.  */
+   To avoid inf-looping, we take advantage of the fact that
+   the bytecode we generate is made of syntactically nested loops, more
+   specifically, every loop has a single entry point and single exit point.
+   The function takes 2 more arguments (`loop_entry` and `loop_exit`).
+   `loop_entry` points to the sole entry point of the current loop and
+   `loop_exit` points to its sole exit point (when non-NULL).
+   The function can assume that `loop_exit` is "mutually exclusive".
+   Termination of recursive calls is ensured because either `loop_entry`
+   is larger, or it's equal but `p2` is larger.  */
 static bool
 mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
-                       re_char *p2, re_char *done_beg, re_char *done_end)
+                       re_char *p2, re_char *loop_entry, re_char *loop_exit)
 {
   re_opcode_t op2;
   unsigned char *pend = bufp->buffer + bufp->used;
@@ -3765,8 +3767,23 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, 
re_char *p1,
   eassert (p1 >= bufp->buffer && p1 < pend
           && p2 >= bufp->buffer && p2 <= pend);
 
-  eassert (done_beg <= done_end);
-  eassert (done_end <= p2);
+  if (p2 == loop_exit)
+    /* Presumably already checked elsewhere.  */
+    return true;
+  eassert (loop_entry && p2 >= loop_entry);
+  /* eassert (!loop_exit  || p2 < loop_exit); */
+  if (p2 < loop_entry)
+    return false;
+  if (loop_exit && p2 > loop_exit)
+    {
+      void print_compiled_pattern (struct re_pattern_buffer *bufp);
+      print_compiled_pattern (bufp);
+      fprintf (stderr, "p1==%d, p2==%d, loop-entry==%d, loop-exit==%d\n",
+               p1 - bufp->buffer, p2 - bufp->buffer,
+               loop_entry - bufp->buffer, loop_exit - bufp->buffer);
+      error ("WOW1!");
+    }
+    /* return false; */
 
   /* Skip over open/close-group commands.
      If what follows this loop is a ...+ construct,
@@ -3860,27 +3877,30 @@ mutually_exclusive_aux (struct re_pattern_buffer *bufp, 
re_char *p1,
        EXTRACT_NUMBER_AND_INCR (mcnt, p2);
        re_char *p2_other = p2 + mcnt;
 
-       /* When we jump backward we bump `done_end` up to `p3` under
-          the assumption that any other position between `done_end`
-          and `p3` is either:
-           - checked by the other call to RECURSE.
-           - not reachable from here (e.g. for positions before the
-             `on_failure_jump`), or at least not without first
-             jumping before `done_beg`.
-           This should hold because our state machines are not arbitrary:
-           they consists of syntaxically nested loops with limited
-          control flow.
-          FIXME: This can fail (i.e. return true when it shouldn't)
-          if we start generating bytecode with a different shape,
-          so maybe we should bite the bullet and replace done_beg/end
-          with an actual list of positions we've already processed.  */
-#define RECURSE(p3)                                                \
-  ((p3) < done_beg ? mutually_exclusive_aux (bufp, p1, p3, p3, p3) \
-   : (p3) <= done_end ? true                                       \
-   : mutually_exclusive_aux (bufp, p1, p3, done_beg,               \
-                            (p3) > p2_orig ? done_end : (p3)))
-
-       return RECURSE (p2) && RECURSE (p2_other);
+       /* If we jump backward to the entry point of the current loop
+          it means it's a zero-length cycle through that loop, so
+          this cycle itself does not break mutual-exclusion.
+          If we jump backward to a new loop nested within the current one
+          then the `p2_other` points to the exit of that inner loop
+          and the other RECURSE will check what happens when we leave
+          the loop so we can focus on checking just the loop body,
+          so we pass new values for `loop-entry` and `loop_exit`.
+          In all other cases, we just recurse "normally": if it's before
+          `loop_entry` it's a bug that will be caught by checks at
+          the entrace of the function and otherwise it's a forward jump
+          which doesn't need any special treatment.
+          FIXME: This is failsafe (can't return true when it shouldn't)
+          but it could be too conservative if we start generating bytecode
+          with a different shape, so maybe we should bite the bullet and
+          replace done_beg/end with an actual list of positions we've
+          already processed.  */
+#define RECURSE(p3, p3_other)                            \
+  ((p3) == loop_entry ? true                             \
+   : loop_entry < (p3) && (p3) < p2_orig ?               \
+     mutually_exclusive_aux (bufp, p1, p3, p3, p3_other) \
+   : mutually_exclusive_aux (bufp, p1, p3, loop_entry, loop_exit))
+        
+       return RECURSE (p2, p2_other) && RECURSE (p2_other, p2);
       }
 
     default:
@@ -3895,7 +3915,7 @@ #define RECURSE(p3)                                       
         \
 mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
                      re_char *p2)
 {
-  return mutually_exclusive_aux (bufp, p1, p2, p2, p2);
+  return mutually_exclusive_aux (bufp, p1, p2, bufp->buffer, NULL);
 }
 
 /* Matching routines.  */

reply via email to

[Prev in Thread] Current Thread [Next in Thread]