bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#65726: 29.1.50; Crash in regexp engine


From: Stefan Monnier
Subject: bug#65726: 29.1.50; Crash in regexp engine
Date: Sat, 09 Sep 2023 11:55:28 -0400
User-agent: Gnus/5.13 (Gnus v5.13)

>> Adding an `assert (p2 >= done)` confirms that this does happen,
>
> Can you give an example of such a regexp? I ran `make check` with your patch
> applied and checking enabled but got no assertion failure.

My patch didn't include the assertion :-)
The patch below does and hits an assertion failure during the dump:

    Loading format...
    Loading bindings...
    WOW1!
    
    Error: error ("WOW1!")
      mapbacktrace(#[1028 
"\1\4\203\24\0\301\302!\210\300\4!\210\301\303!\210\202\35\0\301\304!\210\3\3B\262\1\211\2035\0\300\1@!\210\211A\211\262\2\2035\0\301\305!\210\202!\0\301\306!\207"
 [prin1 princ "  " "(" "  (" " " ")\n"] 7 "\n\n(fn EVALD FUNC ARGS FLAGS)"])
      debug-early-backtrace()
      debug-early(error (error "WOW1!"))
      string-match("\\`[^ ]+\\( [^ ]+\\)*\\'" "h r" nil t)
      key-valid-p("h r")
      keymap--check("h r")
      keymap-set((keymap (27 keymap (119 . eww-search-words)) (111 . occur)) "h 
r" highlight-regexp)
      define-keymap("o" occur "M-w" eww-search-words "h r" highlight-regexp "h 
p" highlight-phrase "h l" highlight-lines-matching-regexp "h ." 
highlight-symbol-at-point "h u" unhighlight-regexp "h f" hi-lock-find-patterns 
"h w" hi-lock-write-interactive-patterns)
      (defvar search-map (define-keymap "o" 'occur "M-w" 'eww-search-words "h 
r" 'highlight-regexp "h p" 'highlight-phrase "h l" 
'highlight-lines-matching-regexp "h ." 'highlight-symbol-at-point "h u" 
'unhighlight-regexp "h f" 'hi-lock-find-patterns "h w" 
'hi-lock-write-interactive-patterns) 
("/home/monnier/src/emacs/main/lisp/bindings.elc" . 40677))
      load("bindings")
      load("loadup.el")


-- Stefan


diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 394ba22e9b0..b47cdfbfef8 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -3643,19 +3643,125 @@ execute_charset (re_char **pp, int c, int corig, bool 
unibyte,
   return not;
 }
 
+/* Case where `p2` points to an `exactn`.  */
+static bool
+mutually_exclusive_exactn (struct re_pattern_buffer *bufp, re_char *p1,
+                          re_char *p2)
+{
+  bool multibyte = RE_MULTIBYTE_P (bufp);
+  int c
+    = (re_opcode_t) *p2 == endline ? '\n'
+      : RE_STRING_CHAR (p2 + 2, multibyte);
+
+  if ((re_opcode_t) *p1 == exactn)
+    {
+      if (c != RE_STRING_CHAR (p1 + 2, multibyte))
+       {
+         DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
+         return true;
+       }
+    }
+
+  else if ((re_opcode_t) *p1 == charset
+          || (re_opcode_t) *p1 == charset_not)
+    {
+      if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
+                            Qnil))
+       {
+         DEBUG_PRINT ("         No match => fast loop.\n");
+         return true;
+       }
+    }
+  else if ((re_opcode_t) *p1 == anychar
+          && c == '\n')
+    {
+      DEBUG_PRINT ("   . != \\n => fast loop.\n");
+      return true;
+    }
+  return false;
+}
+
+/* Case where `p2` points to an `charset`.  */
+static bool
+mutually_exclusive_charset (struct re_pattern_buffer *bufp, re_char *p1,
+                           re_char *p2)
+{
+  /* It is hard to list up all the character in charset
+     P2 if it includes multibyte character.  Give up in
+     such case.  */
+  if (!RE_MULTIBYTE_P (bufp) || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
+    {
+      /* Now, we are sure that P2 has no range table.
+            So, for the size of bitmap in P2, 'p2[1]' is
+            enough.  But P1 may have range table, so the
+            size of bitmap table of P1 is extracted by
+            using macro 'CHARSET_BITMAP_SIZE'.
+
+            In a multibyte case, we know that all the character
+            listed in P2 is ASCII.  In a unibyte case, P1 has only a
+            bitmap table.  So, in both cases, it is enough to test
+            only the bitmap table of P1.  */
+
+      if ((re_opcode_t) *p1 == charset)
+       {
+         int idx;
+         /* We win if the charset inside the loop
+                has no overlap with the one after the loop.  */
+         for (idx = 0;
+               (idx < (int) p2[1]
+                && idx < CHARSET_BITMAP_SIZE (p1));
+               idx++)
+           if ((p2[2 + idx] & p1[2 + idx]) != 0)
+             break;
+
+         if (idx == p2[1]
+             || idx == CHARSET_BITMAP_SIZE (p1))
+           {
+             DEBUG_PRINT ("     No match => fast loop.\n");
+             return true;
+           }
+       }
+      else if ((re_opcode_t) *p1 == charset_not)
+       {
+         int idx;
+         /* We win if the charset_not inside the loop lists
+                every character listed in the charset after.  */
+         for (idx = 0; idx < (int) p2[1]; idx++)
+           if (! (p2[2 + idx] == 0
+                  || (idx < CHARSET_BITMAP_SIZE (p1)
+                      && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
+             break;
+
+         if (idx == p2[1])
+           {
+             DEBUG_PRINT ("     No match => fast loop.\n");
+             return true;
+           }
+       }
+    }
+  return false;
+}
+
 /* True if "p1 matches something" implies "p2 fails".  */
+/* Avoiding inf-loops:
+   We're trying to follow all paths reachable from `p2`, but since some
+   loops can match the empty string, this can loop back to `p2`.
+   To avoid inf-looping, we rely on a lexicographic ordering using `done`
+   and `p2`: at every recursion, either `done` is larger, or it
+   is unchanged and `p2` is larger.  */
 static bool
-mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
-                     re_char *p2)
+mutually_exclusive_aux (struct re_pattern_buffer *bufp, re_char *p1,
+                       re_char *p2, re_char *done)
 {
   re_opcode_t op2;
-  bool multibyte = RE_MULTIBYTE_P (bufp);
   unsigned char *pend = bufp->buffer + bufp->used;
   re_char *p2_orig = p2;
 
   eassert (p1 >= bufp->buffer && p1 < pend
           && p2 >= bufp->buffer && p2 <= pend);
 
+  eassert (p2 >= done);
+
   /* Skip over open/close-group commands.
      If what follows this loop is a ...+ construct,
      look at what begins its body, since we will have to
@@ -3684,98 +3790,14 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, 
re_char *p1,
 
     case endline:
     case exactn:
-      {
-       int c
-         = (re_opcode_t) *p2 == endline ? '\n'
-         : RE_STRING_CHAR (p2 + 2, multibyte);
-
-       if ((re_opcode_t) *p1 == exactn)
-         {
-           if (c != RE_STRING_CHAR (p1 + 2, multibyte))
-             {
-               DEBUG_PRINT ("  '%c' != '%c' => fast loop.\n", c, p1[2]);
-               return true;
-             }
-         }
-
-       else if ((re_opcode_t) *p1 == charset
-                || (re_opcode_t) *p1 == charset_not)
-         {
-           if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
-                                  Qnil))
-             {
-               DEBUG_PRINT ("   No match => fast loop.\n");
-               return true;
-             }
-         }
-       else if ((re_opcode_t) *p1 == anychar
-                && c == '\n')
-         {
-           DEBUG_PRINT ("   . != \\n => fast loop.\n");
-           return true;
-         }
-      }
-      break;
+      return mutually_exclusive_exactn (bufp, p1, p2);
 
     case charset:
       {
        if ((re_opcode_t) *p1 == exactn)
-         /* Reuse the code above.  */
-         return mutually_exclusive_p (bufp, p2, p1);
-
-      /* It is hard to list up all the character in charset
-        P2 if it includes multibyte character.  Give up in
-        such case.  */
-      else if (!multibyte || !CHARSET_RANGE_TABLE_EXISTS_P (p2))
-       {
-         /* Now, we are sure that P2 has no range table.
-            So, for the size of bitmap in P2, 'p2[1]' is
-            enough.  But P1 may have range table, so the
-            size of bitmap table of P1 is extracted by
-            using macro 'CHARSET_BITMAP_SIZE'.
-
-            In a multibyte case, we know that all the character
-            listed in P2 is ASCII.  In a unibyte case, P1 has only a
-            bitmap table.  So, in both cases, it is enough to test
-            only the bitmap table of P1.  */
-
-         if ((re_opcode_t) *p1 == charset)
-           {
-             int idx;
-             /* We win if the charset inside the loop
-                has no overlap with the one after the loop.  */
-             for (idx = 0;
-                  (idx < (int) p2[1]
-                   && idx < CHARSET_BITMAP_SIZE (p1));
-                  idx++)
-               if ((p2[2 + idx] & p1[2 + idx]) != 0)
-                 break;
-
-             if (idx == p2[1]
-                 || idx == CHARSET_BITMAP_SIZE (p1))
-               {
-                 DEBUG_PRINT ("         No match => fast loop.\n");
-                 return true;
-               }
-           }
-         else if ((re_opcode_t) *p1 == charset_not)
-           {
-             int idx;
-             /* We win if the charset_not inside the loop lists
-                every character listed in the charset after.  */
-             for (idx = 0; idx < (int) p2[1]; idx++)
-               if (! (p2[2 + idx] == 0
-                      || (idx < CHARSET_BITMAP_SIZE (p1)
-                          && ((p2[2 + idx] & ~ p1[2 + idx]) == 0))))
-                 break;
-
-             if (idx == p2[1])
-               {
-                 DEBUG_PRINT ("         No match => fast loop.\n");
-                 return true;
-               }
-             }
-         }
+         return mutually_exclusive_exactn (bufp, p2, p1);
+       else
+         return mutually_exclusive_charset (bufp, p1, p2);
       }
       break;
 
@@ -3783,9 +3805,9 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, 
re_char *p1,
       switch (*p1)
        {
        case exactn:
+         return mutually_exclusive_exactn (bufp, p2, p1);
        case charset:
-         /* Reuse the code above.  */
-         return mutually_exclusive_p (bufp, p2, p1);
+         return mutually_exclusive_charset (bufp, p2, p1);
        case charset_not:
          /* When we have two charset_not, it's very unlikely that
             they don't overlap.  The union of the two sets of excluded
@@ -3830,11 +3852,25 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, 
re_char *p1,
         int mcnt;
        p2++;
        EXTRACT_NUMBER_AND_INCR (mcnt, p2);
-       /* Don't just test `mcnt > 0` because non-greedy loops have
-          their test at the end with an unconditional jump at the start.  */
-       if (p2 > p2_orig && mcnt >= 0) /* Ensure forward progress.  */
-         return (mutually_exclusive_p (bufp, p1, p2)
-                 && mutually_exclusive_p (bufp, p1, p2 + mcnt));
+       re_char *p2_other = p2 + mcnt;
+       /* Presumably, any position up to `done` should be a position we
+          already considered elsewhere (because our state machines are not
+          arbitrary: they consists of syntaxically nested loops with limited
+          control flow), so we can consider those back jumps as mutually
+          exclusive here.  */
+       if (p2 < done)
+         error ("WOW1!");
+       if (p2_other < done)
+         error ("WOW2!");
+       return ((p2 < done ? true
+                : p2 == done ? true
+                : mutually_exclusive_aux (bufp, p1, p2,
+                                          p2 > p2_orig ? done : p2))
+               && (p2_other < done ? true
+                   : p2_other == done ? true
+                   : mutually_exclusive_aux (bufp, p1, p2_other,
+                                             p2_other > p2_orig
+                                             ? done : p2_other)));
        break;
       }
 
@@ -3846,6 +3882,12 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, 
re_char *p1,
   return false;
 }
 
+static bool
+mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
+                     re_char *p2)
+{
+  return mutually_exclusive_aux (bufp, p1, p2, min (p2, p1));
+}
 
 /* Matching routines.  */
 






reply via email to

[Prev in Thread] Current Thread [Next in Thread]