grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v3.3-39-gc2ec762


From: Paul Eggert
Subject: grep branch, master, updated. v3.3-39-gc2ec762
Date: Sun, 22 Dec 2019 19:40:12 -0500 (EST)

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  c2ec762dbc132d3c4a727c8e2ecab2a7286729d6 (commit)
       via  abb7f4f2325f26f930ff59b702fe42568a8e81e7 (commit)
      from  cf09252295c554dd3eba4cdb8eb53911fb495f40 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=c2ec762dbc132d3c4a727c8e2ecab2a7286729d6


commit c2ec762dbc132d3c4a727c8e2ecab2a7286729d6
Author: Paul Eggert <address@hidden>
Date:   Sun Dec 22 16:39:09 2019 -0800

    grep: fix some bugs in pattern-grouping speedup
    
    This fixes some bugs in the previous commit,
    and should finish the fix for Bug#33249.
    * NEWS: Mention fix for Bug#33249.
    * src/dfasearch.c (possible_backrefs_in_pattern, regex_compile)
    (GEAcompile): In new code, prefer ptrdiff_t to size_t when either
    will do, since ptrdiff_t has better error checking.  At some point
    we should adjust the old code too.
    (possible_backrefs_in_pattern): Rename from
    find_backref_in_pattern.  New arg BS_SAFE.  All uses changed.
    Fix false negative if a multibyte character ends in a single
    '\\' byte, followed by the two bytes '\\', '1'.
    (regex_compile): Simplify.
    (GEAcompile): Avoid quadratic behavior when reallocating growing
    buffers.  Fix a couple of bugs in copying pattern data involving
    backreferences.  Fix another bug in copying pattern metadata
    involving backreferences, by removing the need to copy it.

diff --git a/NEWS b/NEWS
index 8eda865..718659c 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,10 @@ GNU grep NEWS                                    -*- outline 
-*-
   output is /dev/null.
   [Bug#37716 introduced in grep 3.2]
 
+  A performance bug has been fixed when grep is given many patterns
+  each of which lack backreferences.
+  [Bug#33249 introduced in grep 2.5]
+
   A performance bug has been fixed for patterns like '01.2' that
   cause grep to reorder tokens internally.
   [Bug#34951 introduced in grep 3.2]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 42d16dc..eb7732b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -35,7 +35,7 @@ struct dfa_comp
   struct dfa *dfa;
 
   /* Regex compiled regexps. */
-  struct re_pattern_buffer* patterns;
+  struct re_pattern_buffer *patterns;
   size_t pcount;
   struct re_registers regs;
 
@@ -103,25 +103,49 @@ kwsmusts (struct dfa_comp *dc)
   dfamustfree (dm);
 }
 
-static bool
-find_backref_in_pattern (const char *keys, size_t len)
+/* Return true if KEYS, of length LEN, might contain a backreference.
+   Return false if KEYS cannot contain a backreference.
+   BS_SAFE is true of encodings where a backslash cannot appear as the
+   last byte of a multibyte character.  */
+static bool _GL_ATTRIBUTE_PURE
+possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
 {
-  for (; len; keys++, len--)
-    if (keys[0] == '\\')
-      {
-        if (1 <= keys[1] && keys[1] <= '9')
-          return true;
-
-        if (keys[1] == '\\')
-          keys++;
-      }
-
+  /* Normally a backslash, but in an unsafe encoding this is a a
+     non-char value so that the comparison below always fails, because
+     if there are two adjacent '\' bytes the first might the last byte
+     of a multibyte character.  */
+  int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1;
+
+  /* This code can return true even if KEYS lacks a backreference, for
+     patterns like [\2], or for encodings where '\' appears as
+     the last byte of a multibyte character.  However, false alarms
+     should be rare and do not affect correctness.  */
+
+  /* Do not look for a backslash in the pattern's last byte, since it
+     can't be part of a backreference and this streamlines the code.  */
+  len--;
+
+  if (0 <= len)
+    {
+      char const *lim = keys + len;
+      for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++)
+        {
+          if ('1' <= p[1] && p[1] <= '9')
+            return true;
+          if (p[1] == second_backslash)
+            {
+              p++;
+              if (p == lim)
+                break;
+            }
+        }
+    }
   return false;
 }
 
 static bool
-regex_compile (struct dfa_comp *dc, const char *p, size_t len, size_t pcount,
-               size_t lineno, bool syntax_only)
+regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
+               ptrdiff_t pcount, ptrdiff_t lineno, bool syntax_only)
 {
   struct re_pattern_buffer pat0;
   struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
@@ -129,29 +153,23 @@ regex_compile (struct dfa_comp *dc, const char *p, size_t 
len, size_t pcount,
   pat->allocated = 0;
 
   /* Do not use a fastmap with -i, to work around glibc Bug#20381.  */
-  pat->fastmap = (syntax_only || match_icase) ? NULL : xmalloc (UCHAR_MAX + 1);
+  pat->fastmap = syntax_only | match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
 
   pat->translate = NULL;
 
   char const *err = re_compile_pattern (p, len, pat);
-
   if (!err)
     return true;
 
-  /* With patterns specified only on the command line, emit the bare
-     diagnostic.  Otherwise, include a filename:lineno: prefix.  */
-  size_t new_lineno;
-  const char *pat_filename;
-
-  if (lineno != (size_t) -1)
-    pat_filename = pattern_file_name (lineno + 1, &new_lineno);
-  else
-    pat_filename = "\0";
+  /* Emit a filename:lineno: prefix for patterns taken from files.  */
+  size_t pat_lineno = lineno;
+  char const *pat_filename
+    = lineno < 0 ? "\0" : pattern_file_name (lineno + 1, &pat_lineno);
 
   if (*pat_filename == '\0')
     error (0, 0, "%s", err);
   else
-    error (0, 0, "%s:%zu: %s", pat_filename, new_lineno, err);
+    error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err);
 
   return false;
 }
@@ -169,6 +187,7 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
   re_set_syntax (syntax_bits);
   int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
   dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
+  bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
@@ -177,12 +196,20 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
   char const *p = pattern;
   char const *patlim = pattern + size;
   bool compilation_failed = false;
-  size_t palloc = 0;
+
+  dc->patterns = xmalloc (sizeof *dc->patterns);
+  dc->patterns++;
+  dc->pcount = 0;
+  size_t palloc = 1;
 
   char const *prev = pattern;
+
+  /* Buffer containing backreference-free patterns.  */
   char *buf = NULL;
-  size_t buflen = 0;
-  size_t lineno = 0;
+  ptrdiff_t buflen = 0;
+  size_t bufalloc = 0;
+
+  ptrdiff_t lineno = 0;
 
   do
     {
@@ -196,18 +223,26 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
       else
         len = patlim - p;
 
-      bool backref = find_backref_in_pattern (p, len);
+      bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
 
       if (backref && prev < p)
         {
-          len = p - prev;
-          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
-          memcpy (buf + buflen, p, len);
-          buflen += len;
+          ptrdiff_t prevlen = p - prev;
+          while (bufalloc < buflen + prevlen)
+            buf = x2realloc (buf, &bufalloc);
+          memcpy (buf + buflen, prev, prevlen);
+          buflen += prevlen;
         }
 
-      if (palloc <= dc->pcount)
-        dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof 
*dc->patterns);
+      /* Ensure room for at least two more patterns.  The extra one is
+         for the regex_compile that may be executed after this loop
+         exits, and its (unused) slot is patterns[-1] until then.  */
+      while (palloc <= dc->pcount + 1)
+        {
+          dc->patterns = x2nrealloc (dc->patterns - 1, &palloc,
+                                     sizeof *dc->patterns);
+          dc->patterns++;
+        }
 
       if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
         compilation_failed = true;
@@ -230,10 +265,10 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
     {
       if (pattern < prev)
         {
-          size_t len = patlim - prev;
-          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
-          memcpy (buf + buflen, prev, len);
-          buflen += len;
+          ptrdiff_t prevlen = patlim - prev;
+          buf = xrealloc (buf, buflen + prevlen);
+          memcpy (buf + buflen, prev, prevlen);
+          buflen += prevlen;
         }
       else
         {
@@ -244,16 +279,12 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
 
   if (buf != NULL)
     {
-      dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
-
-      for (size_t i = 0; i < dc->pcount; i++)
-        dc->patterns[i + 1] = dc->patterns[i];
+      dc->patterns--;
+      dc->pcount++;
 
       if (!regex_compile (dc, buf, buflen, 0, -1, false))
         abort ();
 
-      dc->pcount++;
-
       if (buf != pattern)
         free (buf);
     }

http://git.savannah.gnu.org/cgit/grep.git/commit/?id=abb7f4f2325f26f930ff59b702fe42568a8e81e7


commit c2ec762dbc132d3c4a727c8e2ecab2a7286729d6
Author: Paul Eggert <address@hidden>
Date:   Sun Dec 22 16:39:09 2019 -0800

    grep: fix some bugs in pattern-grouping speedup
    
    This fixes some bugs in the previous commit,
    and should finish the fix for Bug#33249.
    * NEWS: Mention fix for Bug#33249.
    * src/dfasearch.c (possible_backrefs_in_pattern, regex_compile)
    (GEAcompile): In new code, prefer ptrdiff_t to size_t when either
    will do, since ptrdiff_t has better error checking.  At some point
    we should adjust the old code too.
    (possible_backrefs_in_pattern): Rename from
    find_backref_in_pattern.  New arg BS_SAFE.  All uses changed.
    Fix false negative if a multibyte character ends in a single
    '\\' byte, followed by the two bytes '\\', '1'.
    (regex_compile): Simplify.
    (GEAcompile): Avoid quadratic behavior when reallocating growing
    buffers.  Fix a couple of bugs in copying pattern data involving
    backreferences.  Fix another bug in copying pattern metadata
    involving backreferences, by removing the need to copy it.

diff --git a/NEWS b/NEWS
index 8eda865..718659c 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,10 @@ GNU grep NEWS                                    -*- outline 
-*-
   output is /dev/null.
   [Bug#37716 introduced in grep 3.2]
 
+  A performance bug has been fixed when grep is given many patterns
+  each of which lack backreferences.
+  [Bug#33249 introduced in grep 2.5]
+
   A performance bug has been fixed for patterns like '01.2' that
   cause grep to reorder tokens internally.
   [Bug#34951 introduced in grep 3.2]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 42d16dc..eb7732b 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -35,7 +35,7 @@ struct dfa_comp
   struct dfa *dfa;
 
   /* Regex compiled regexps. */
-  struct re_pattern_buffer* patterns;
+  struct re_pattern_buffer *patterns;
   size_t pcount;
   struct re_registers regs;
 
@@ -103,25 +103,49 @@ kwsmusts (struct dfa_comp *dc)
   dfamustfree (dm);
 }
 
-static bool
-find_backref_in_pattern (const char *keys, size_t len)
+/* Return true if KEYS, of length LEN, might contain a backreference.
+   Return false if KEYS cannot contain a backreference.
+   BS_SAFE is true of encodings where a backslash cannot appear as the
+   last byte of a multibyte character.  */
+static bool _GL_ATTRIBUTE_PURE
+possible_backrefs_in_pattern (char const *keys, ptrdiff_t len, bool bs_safe)
 {
-  for (; len; keys++, len--)
-    if (keys[0] == '\\')
-      {
-        if (1 <= keys[1] && keys[1] <= '9')
-          return true;
-
-        if (keys[1] == '\\')
-          keys++;
-      }
-
+  /* Normally a backslash, but in an unsafe encoding this is a a
+     non-char value so that the comparison below always fails, because
+     if there are two adjacent '\' bytes the first might the last byte
+     of a multibyte character.  */
+  int second_backslash = bs_safe ? '\\' : CHAR_MAX + 1;
+
+  /* This code can return true even if KEYS lacks a backreference, for
+     patterns like [\2], or for encodings where '\' appears as
+     the last byte of a multibyte character.  However, false alarms
+     should be rare and do not affect correctness.  */
+
+  /* Do not look for a backslash in the pattern's last byte, since it
+     can't be part of a backreference and this streamlines the code.  */
+  len--;
+
+  if (0 <= len)
+    {
+      char const *lim = keys + len;
+      for (char const *p = keys; (p = memchr (p, '\\', lim - p)); p++)
+        {
+          if ('1' <= p[1] && p[1] <= '9')
+            return true;
+          if (p[1] == second_backslash)
+            {
+              p++;
+              if (p == lim)
+                break;
+            }
+        }
+    }
   return false;
 }
 
 static bool
-regex_compile (struct dfa_comp *dc, const char *p, size_t len, size_t pcount,
-               size_t lineno, bool syntax_only)
+regex_compile (struct dfa_comp *dc, char const *p, ptrdiff_t len,
+               ptrdiff_t pcount, ptrdiff_t lineno, bool syntax_only)
 {
   struct re_pattern_buffer pat0;
   struct re_pattern_buffer *pat = syntax_only ? &pat0 : &dc->patterns[pcount];
@@ -129,29 +153,23 @@ regex_compile (struct dfa_comp *dc, const char *p, size_t 
len, size_t pcount,
   pat->allocated = 0;
 
   /* Do not use a fastmap with -i, to work around glibc Bug#20381.  */
-  pat->fastmap = (syntax_only || match_icase) ? NULL : xmalloc (UCHAR_MAX + 1);
+  pat->fastmap = syntax_only | match_icase ? NULL : xmalloc (UCHAR_MAX + 1);
 
   pat->translate = NULL;
 
   char const *err = re_compile_pattern (p, len, pat);
-
   if (!err)
     return true;
 
-  /* With patterns specified only on the command line, emit the bare
-     diagnostic.  Otherwise, include a filename:lineno: prefix.  */
-  size_t new_lineno;
-  const char *pat_filename;
-
-  if (lineno != (size_t) -1)
-    pat_filename = pattern_file_name (lineno + 1, &new_lineno);
-  else
-    pat_filename = "\0";
+  /* Emit a filename:lineno: prefix for patterns taken from files.  */
+  size_t pat_lineno = lineno;
+  char const *pat_filename
+    = lineno < 0 ? "\0" : pattern_file_name (lineno + 1, &pat_lineno);
 
   if (*pat_filename == '\0')
     error (0, 0, "%s", err);
   else
-    error (0, 0, "%s:%zu: %s", pat_filename, new_lineno, err);
+    error (0, 0, "%s:%zu: %s", pat_filename, pat_lineno, err);
 
   return false;
 }
@@ -169,6 +187,7 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
   re_set_syntax (syntax_bits);
   int dfaopts = eolbyte ? 0 : DFA_EOL_NUL;
   dfasyntax (dc->dfa, &localeinfo, syntax_bits, dfaopts);
+  bool bs_safe = !localeinfo.multibyte | localeinfo.using_utf8;
 
   /* For GNU regex, pass the patterns separately to detect errors like
      "[\nallo\n]\n", where the patterns are "[", "allo" and "]", and
@@ -177,12 +196,20 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
   char const *p = pattern;
   char const *patlim = pattern + size;
   bool compilation_failed = false;
-  size_t palloc = 0;
+
+  dc->patterns = xmalloc (sizeof *dc->patterns);
+  dc->patterns++;
+  dc->pcount = 0;
+  size_t palloc = 1;
 
   char const *prev = pattern;
+
+  /* Buffer containing backreference-free patterns.  */
   char *buf = NULL;
-  size_t buflen = 0;
-  size_t lineno = 0;
+  ptrdiff_t buflen = 0;
+  size_t bufalloc = 0;
+
+  ptrdiff_t lineno = 0;
 
   do
     {
@@ -196,18 +223,26 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
       else
         len = patlim - p;
 
-      bool backref = find_backref_in_pattern (p, len);
+      bool backref = possible_backrefs_in_pattern (p, len, bs_safe);
 
       if (backref && prev < p)
         {
-          len = p - prev;
-          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
-          memcpy (buf + buflen, p, len);
-          buflen += len;
+          ptrdiff_t prevlen = p - prev;
+          while (bufalloc < buflen + prevlen)
+            buf = x2realloc (buf, &bufalloc);
+          memcpy (buf + buflen, prev, prevlen);
+          buflen += prevlen;
         }
 
-      if (palloc <= dc->pcount)
-        dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof 
*dc->patterns);
+      /* Ensure room for at least two more patterns.  The extra one is
+         for the regex_compile that may be executed after this loop
+         exits, and its (unused) slot is patterns[-1] until then.  */
+      while (palloc <= dc->pcount + 1)
+        {
+          dc->patterns = x2nrealloc (dc->patterns - 1, &palloc,
+                                     sizeof *dc->patterns);
+          dc->patterns++;
+        }
 
       if (!regex_compile (dc, p, len, dc->pcount, lineno, !backref))
         compilation_failed = true;
@@ -230,10 +265,10 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
     {
       if (pattern < prev)
         {
-          size_t len = patlim - prev;
-          buf = xrealloc (buf, (buflen + len) * sizeof *buf);
-          memcpy (buf + buflen, prev, len);
-          buflen += len;
+          ptrdiff_t prevlen = patlim - prev;
+          buf = xrealloc (buf, buflen + prevlen);
+          memcpy (buf + buflen, prev, prevlen);
+          buflen += prevlen;
         }
       else
         {
@@ -244,16 +279,12 @@ GEAcompile (char *pattern, size_t size, reg_syntax_t 
syntax_bits)
 
   if (buf != NULL)
     {
-      dc->patterns = x2nrealloc (dc->patterns, &palloc, sizeof *dc->patterns);
-
-      for (size_t i = 0; i < dc->pcount; i++)
-        dc->patterns[i + 1] = dc->patterns[i];
+      dc->patterns--;
+      dc->pcount++;
 
       if (!regex_compile (dc, buf, buflen, 0, -1, false))
         abort ();
 
-      dc->pcount++;
-
       if (buf != pattern)
         free (buf);
     }

-----------------------------------------------------------------------

Summary of changes:
 NEWS            |   4 ++
 src/dfasearch.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 146 insertions(+), 24 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]