bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 1/3] Support for multiple regex patterns for URL matching


From: Tim Rühsen
Subject: Re: [PATCH 1/3] Support for multiple regex patterns for URL matching
Date: Sun, 2 May 2021 14:57:34 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.10.0

Hi,

your patches look great, good work. And allowing multiple regexes seems to be a good idea to me.

Here comes the (small) but...

a) Wget is in maintenance mode - we try not to add any new features here; just bugs are fixed. New features (and I consider this a new feature due to b)) should go into Wget2[1] only.

b) This is a breaking change. Well, on the first glance it just extends an already existing feature. But I see corner cases where this might break existing production. E.g. scripts that rely on the behavior that the last regex on the command line overrides any previous would behave differently with your patch. Another example is when someone has a default regex in a config file and overrides it via command line.

Then there also open questions like what is the most flexible way in chaining regexes with (not|and|or) operators.

So if you consider rewriting your patches for Wget2, I'll be happy to support you and/or discuss this topic with you :-)

There is little "obstacle": for non-trivial work (>15 lines of code) to be accepted, you have to sign the FSF copyright assignment. For this, follow the instructions at [2].

I would love to hear from you.

Regards, Tim

[1] https://gitlab.com/gnuwget/wget2
[2] https://git.savannah.gnu.org/cgit/gnulib.git/tree/doc/Copyright/request-assign.future

On 01.05.21 08:39, Nekun wrote:
Wget allows to pass --{accept,reject}-regex multiple times, but
internally it has only single string for these options and only
the last given option has effect in result. It obviously may produce
very confusing behavior on recursive fetching. To resolve it,
add support for multiple regex matching similiarly with existing
--{accept,reject} logic.

* src/init.c: Replace str type by vector type for corresponding opts
* src/main.c: Create regex vector from string vector
* src/options.h: Change types of {accept,reject}regex{,_s} opts to vector
* src/utils.h: (accept_url): Search matches in regex vector
   (match_posix_regex): Pass regex in param to regex error handler
   (in_reglist) Add func for searching matches in regex vector
   (regex_vec_create) Add func for creating regex vector
* src/utils.h: Add regex_vec_create definition
---
  src/init.c    | 19 +++++++++++--------
  src/main.c    |  8 ++++----
  src/options.h |  8 ++++----
  src/utils.c   | 48 ++++++++++++++++++++++++++++++++++++++++++------
  src/utils.h   |  1 +
  5 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/src/init.c b/src/init.c
index 935584a0..0ad70f70 100644
--- a/src/init.c
+++ b/src/init.c
@@ -135,7 +135,7 @@ static const struct {
  } commands[] = {
    /* KEEP THIS LIST ALPHABETICALLY SORTED */
    { "accept",           &opt.accepts,           cmd_vector },
-  { "acceptregex",      &opt.acceptregex_s,     cmd_string },
+  { "acceptregex",      &opt.acceptregex_s,     cmd_vector },
    { "addhostdir",       &opt.add_hostdir,       cmd_boolean },
    { "adjustextension",  &opt.adjust_extension,  cmd_boolean },
    { "alwaysrest",       &opt.always_rest,       cmd_boolean }, /* deprecated 
*/
@@ -304,7 +304,7 @@ static const struct {
    { "regextype",        &opt.regex_type,        cmd_spec_regex_type },
    { "reject",           &opt.rejects,           cmd_vector },
    { "rejectedlog",      &opt.rejected_log,      cmd_file },
-  { "rejectregex",      &opt.rejectregex_s,     cmd_string },
+  { "rejectregex",      &opt.rejectregex_s,     cmd_vector },
    { "relativeonly",     &opt.relative_only,     cmd_boolean },
    { "remoteencoding",   &opt.encoding_remote,   cmd_string },
    { "removelisting",    &opt.remove_listing,    cmd_boolean },
@@ -1982,15 +1982,18 @@ cleanup (void)
    xfree (opt.default_page);
    if (opt.regex_type == regex_type_posix)
      {
+      int i;
        if (opt.acceptregex)
-        regfree (opt.acceptregex);
+       for (i = 0; opt.acceptregex[i]; i++)
+         regfree (opt.acceptregex[i]);
        if (opt.rejectregex)
-        regfree (opt.rejectregex);
+        for (i = 0; opt.rejectregex[i]; i++)
+         regfree (opt.rejectregex[i]);
      }
-  xfree (opt.acceptregex);
-  xfree (opt.rejectregex);
-  xfree (opt.acceptregex_s);
-  xfree (opt.rejectregex_s);
+  free_vec ((char **)opt.acceptregex);
+  free_vec ((char **)opt.rejectregex);
+  free_vec (opt.acceptregex_s);
+  free_vec (opt.rejectregex_s);
    free_vec (opt.accepts);
    free_vec (opt.rejects);
    free_vec ((char **)opt.excludes);
diff --git a/src/main.c b/src/main.c
index e506b211..db61640c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1819,14 +1819,14 @@ for details.\n\n"));
      }
    if (opt.acceptregex_s)
      {
-      opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
-      if (!opt.acceptregex)
+      if (!regex_vec_create (&opt.acceptregex,
+          (const char **) opt.acceptregex_s))
          exit (WGET_EXIT_GENERIC_ERROR);
      }
    if (opt.rejectregex_s)
      {
-      opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
-      if (!opt.rejectregex)
+      if (!regex_vec_create (&opt.rejectregex,
+          (const char **) opt.rejectregex_s))
          exit (WGET_EXIT_GENERIC_ERROR);
      }
    if (opt.post_data || opt.post_file_name)
diff --git a/src/options.h b/src/options.h
index a0e9c60d..e701142c 100644
--- a/src/options.h
+++ b/src/options.h
@@ -88,10 +88,10 @@ struct options
    bool ignore_case;             /* Whether to ignore case when
                                     matching dirs and files */
- char *acceptregex_s; /* Patterns to accept (a regex string). */
-  char *rejectregex_s;          /* Patterns to reject (a regex string). */
-  void *acceptregex;            /* Patterns to accept (a regex struct). */
-  void *rejectregex;            /* Patterns to reject (a regex struct). */
+  char **acceptregex_s;          /* Patterns to accept (a regex string). */
+  char **rejectregex_s;          /* Patterns to reject (a regex string). */
+  void **acceptregex;            /* Patterns to accept (a regex struct). */
+  void **rejectregex;            /* Patterns to reject (a regex struct). */
    enum {
  #if defined HAVE_LIBPCRE || HAVE_LIBPCRE2
      regex_type_pcre,
diff --git a/src/utils.c b/src/utils.c
index 426cda31..853a00e1 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1070,15 +1070,28 @@ acceptable (const char *s)
    return true;
  }
+/* Check whether s matches each regex pattern in rvec */
+static bool
+in_reglist (void **rvec, const char *s)
+{
+  for (; *rvec; rvec++)
+    if (opt.regex_match_fun(*rvec,s))
+      return true;
+
+  return false;
+}
+
  /* Determine whether an URL is acceptable to be followed, according to
     regex patterns to accept/reject.  */
  bool
  accept_url (const char *s)
  {
-  if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s))
-    return false;
-  if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s))
-    return false;
+  if (opt.acceptregex && opt.rejectregex)
+    return in_reglist (opt.acceptregex, s) && !in_reglist (opt.rejectregex, s);
+  else if (opt.acceptregex)
+    return in_reglist (opt.acceptregex, s);
+  else if (opt.rejectregex)
+    return !in_reglist (opt.rejectregex, s);
return true;
  }
@@ -1482,6 +1495,29 @@ vec_append (char **vec, const char *str)
    return vec;
  }
+/* Compile regexes from given svec with function given in global opt and
+   save them to the given rvec. Return error status. */
+
+bool
+regex_vec_create(void ***rvec, const char **svec)
+{
+   int i;
+
+   for (i = 0; *svec; svec++, i++)
+     {
+       /* Cover NULL-terminator on realloc */
+       *rvec = xrealloc (*rvec, (i + 2) * sizeof (void *));
+       (*rvec)[i] = opt.regex_compile_fun (*svec);
+       if (!(*rvec)[i])
+         return false;
+     }
+   if (i == 0)
+     *rvec = xmalloc (sizeof (void *));
+   (*rvec)[i] = NULL;
+
+  return true;
+}
+
  /* Sometimes it's useful to create "sets" of strings, i.e. special
     hash tables where you want to store strings as keys and merely
     query for their existence.  Here is a set of utility routines that
@@ -2545,9 +2581,9 @@ match_posix_regex (const void *regex, const char *str)
      return true;
    else
      {
-      size_t errbuf_size = regerror (rc, opt.acceptregex, NULL, 0);
+      size_t errbuf_size = regerror (rc, regex, NULL, 0);
        char *errbuf = xmalloc (errbuf_size);
-      regerror (rc, opt.acceptregex, errbuf, errbuf_size);
+      regerror (rc, regex, errbuf, errbuf_size);
        logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
                   quote (str), rc);
        xfree (errbuf);
diff --git a/src/utils.h b/src/utils.h
index 51062eff..161df97d 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -112,6 +112,7 @@ void wget_read_file_free (struct file_memory *);
  void free_vec (char **);
  char **merge_vecs (char **, char **);
  char **vec_append (char **, const char *);
+bool regex_vec_create (void ***, const char **);
void string_set_add (struct hash_table *, const char *);
  int string_set_contains (struct hash_table *, const char *);


Attachment: OpenPGP_signature
Description: OpenPGP digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]