bug-wget
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 1/3] Support for multiple regex patterns for URL matching


From: Nekun
Subject: [PATCH 1/3] Support for multiple regex patterns for URL matching
Date: Sat, 1 May 2021 06:39:48 +0000

Wget allows to pass --{accept,reject}-regex multiple times, but
internally it has only single string for these options and only
the last given option has effect in result. It obviously may produce
very confusing behavior on recursive fetching. To resolve it,
add support for multiple regex matching similiarly with existing
--{accept,reject} logic.

* src/init.c: Replace str type by vector type for corresponding opts
* src/main.c: Create regex vector from string vector
* src/options.h: Change types of {accept,reject}regex{,_s} opts to vector
* src/utils.h: (accept_url): Search matches in regex vector
  (match_posix_regex): Pass regex in param to regex error handler
  (in_reglist) Add func for searching matches in regex vector
  (regex_vec_create) Add func for creating regex vector
* src/utils.h: Add regex_vec_create definition
---
 src/init.c    | 19 +++++++++++--------
 src/main.c    |  8 ++++----
 src/options.h |  8 ++++----
 src/utils.c   | 48 ++++++++++++++++++++++++++++++++++++++++++------
 src/utils.h   |  1 +
 5 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/src/init.c b/src/init.c
index 935584a0..0ad70f70 100644
--- a/src/init.c
+++ b/src/init.c
@@ -135,7 +135,7 @@ static const struct {
 } commands[] = {
   /* KEEP THIS LIST ALPHABETICALLY SORTED */
   { "accept",           &opt.accepts,           cmd_vector },
-  { "acceptregex",      &opt.acceptregex_s,     cmd_string },
+  { "acceptregex",      &opt.acceptregex_s,     cmd_vector },
   { "addhostdir",       &opt.add_hostdir,       cmd_boolean },
   { "adjustextension",  &opt.adjust_extension,  cmd_boolean },
   { "alwaysrest",       &opt.always_rest,       cmd_boolean }, /* deprecated */
@@ -304,7 +304,7 @@ static const struct {
   { "regextype",        &opt.regex_type,        cmd_spec_regex_type },
   { "reject",           &opt.rejects,           cmd_vector },
   { "rejectedlog",      &opt.rejected_log,      cmd_file },
-  { "rejectregex",      &opt.rejectregex_s,     cmd_string },
+  { "rejectregex",      &opt.rejectregex_s,     cmd_vector },
   { "relativeonly",     &opt.relative_only,     cmd_boolean },
   { "remoteencoding",   &opt.encoding_remote,   cmd_string },
   { "removelisting",    &opt.remove_listing,    cmd_boolean },
@@ -1982,15 +1982,18 @@ cleanup (void)
   xfree (opt.default_page);
   if (opt.regex_type == regex_type_posix)
     {
+      int i;
       if (opt.acceptregex)
-        regfree (opt.acceptregex);
+       for (i = 0; opt.acceptregex[i]; i++)
+         regfree (opt.acceptregex[i]);
       if (opt.rejectregex)
-        regfree (opt.rejectregex);
+        for (i = 0; opt.rejectregex[i]; i++)
+         regfree (opt.rejectregex[i]);
     }
-  xfree (opt.acceptregex);
-  xfree (opt.rejectregex);
-  xfree (opt.acceptregex_s);
-  xfree (opt.rejectregex_s);
+  free_vec ((char **)opt.acceptregex);
+  free_vec ((char **)opt.rejectregex);
+  free_vec (opt.acceptregex_s);
+  free_vec (opt.rejectregex_s);
   free_vec (opt.accepts);
   free_vec (opt.rejects);
   free_vec ((char **)opt.excludes);
diff --git a/src/main.c b/src/main.c
index e506b211..db61640c 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1819,14 +1819,14 @@ for details.\n\n"));
     }
   if (opt.acceptregex_s)
     {
-      opt.acceptregex = opt.regex_compile_fun (opt.acceptregex_s);
-      if (!opt.acceptregex)
+      if (!regex_vec_create (&opt.acceptregex,
+          (const char **) opt.acceptregex_s))
         exit (WGET_EXIT_GENERIC_ERROR);
     }
   if (opt.rejectregex_s)
     {
-      opt.rejectregex = opt.regex_compile_fun (opt.rejectregex_s);
-      if (!opt.rejectregex)
+      if (!regex_vec_create (&opt.rejectregex,
+          (const char **) opt.rejectregex_s))
         exit (WGET_EXIT_GENERIC_ERROR);
     }
   if (opt.post_data || opt.post_file_name)
diff --git a/src/options.h b/src/options.h
index a0e9c60d..e701142c 100644
--- a/src/options.h
+++ b/src/options.h
@@ -88,10 +88,10 @@ struct options
   bool ignore_case;             /* Whether to ignore case when
                                    matching dirs and files */
 
-  char *acceptregex_s;          /* Patterns to accept (a regex string). */
-  char *rejectregex_s;          /* Patterns to reject (a regex string). */
-  void *acceptregex;            /* Patterns to accept (a regex struct). */
-  void *rejectregex;            /* Patterns to reject (a regex struct). */
+  char **acceptregex_s;          /* Patterns to accept (a regex string). */
+  char **rejectregex_s;          /* Patterns to reject (a regex string). */
+  void **acceptregex;            /* Patterns to accept (a regex struct). */
+  void **rejectregex;            /* Patterns to reject (a regex struct). */
   enum {
 #if defined HAVE_LIBPCRE || HAVE_LIBPCRE2
     regex_type_pcre,
diff --git a/src/utils.c b/src/utils.c
index 426cda31..853a00e1 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1070,15 +1070,28 @@ acceptable (const char *s)
   return true;
 }
 
+/* Check whether s matches each regex pattern in rvec */
+static bool
+in_reglist (void **rvec, const char *s)
+{
+  for (; *rvec; rvec++)
+    if (opt.regex_match_fun(*rvec,s))
+      return true;
+
+  return false;
+}
+
 /* Determine whether an URL is acceptable to be followed, according to
    regex patterns to accept/reject.  */
 bool
 accept_url (const char *s)
 {
-  if (opt.acceptregex && !opt.regex_match_fun (opt.acceptregex, s))
-    return false;
-  if (opt.rejectregex && opt.regex_match_fun (opt.rejectregex, s))
-    return false;
+  if (opt.acceptregex && opt.rejectregex)
+    return in_reglist (opt.acceptregex, s) && !in_reglist (opt.rejectregex, s);
+  else if (opt.acceptregex)
+    return in_reglist (opt.acceptregex, s);
+  else if (opt.rejectregex)
+    return !in_reglist (opt.rejectregex, s);
 
   return true;
 }
@@ -1482,6 +1495,29 @@ vec_append (char **vec, const char *str)
   return vec;
 }
 
+/* Compile regexes from given svec with function given in global opt and 
+   save them to the given rvec. Return error status. */
+
+bool
+regex_vec_create(void ***rvec, const char **svec)
+{
+   int i;
+
+   for (i = 0; *svec; svec++, i++)
+     {
+       /* Cover NULL-terminator on realloc */
+       *rvec = xrealloc (*rvec, (i + 2) * sizeof (void *));
+       (*rvec)[i] = opt.regex_compile_fun (*svec);
+       if (!(*rvec)[i])
+         return false;
+     }
+   if (i == 0)
+     *rvec = xmalloc (sizeof (void *));
+   (*rvec)[i] = NULL;
+
+  return true;
+}
+
 /* Sometimes it's useful to create "sets" of strings, i.e. special
    hash tables where you want to store strings as keys and merely
    query for their existence.  Here is a set of utility routines that
@@ -2545,9 +2581,9 @@ match_posix_regex (const void *regex, const char *str)
     return true;
   else
     {
-      size_t errbuf_size = regerror (rc, opt.acceptregex, NULL, 0);
+      size_t errbuf_size = regerror (rc, regex, NULL, 0);
       char *errbuf = xmalloc (errbuf_size);
-      regerror (rc, opt.acceptregex, errbuf, errbuf_size);
+      regerror (rc, regex, errbuf, errbuf_size);
       logprintf (LOG_VERBOSE, _("Error while matching %s: %d\n"),
                  quote (str), rc);
       xfree (errbuf);
diff --git a/src/utils.h b/src/utils.h
index 51062eff..161df97d 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -112,6 +112,7 @@ void wget_read_file_free (struct file_memory *);
 void free_vec (char **);
 char **merge_vecs (char **, char **);
 char **vec_append (char **, const char *);
+bool regex_vec_create (void ***, const char **);
 
 void string_set_add (struct hash_table *, const char *);
 int string_set_contains (struct hash_table *, const char *);
-- 
2.26.3





reply via email to

[Prev in Thread] Current Thread [Next in Thread]