>From af5170772750451d3d67028430f1d21075f6bfb1 Mon Sep 17 00:00:00 2001
From: Ander Juaristi
Date: Tue, 22 Sep 2015 21:10:38 +0200
Subject: [PATCH] Added --convert-file-only option
---
src/convert.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
src/convert.h | 2 ++
src/init.c | 2 ++
src/main.c | 24 +++++++++++----
src/options.h | 3 ++
5 files changed, 118 insertions(+), 8 deletions(-)
diff --git a/src/convert.c b/src/convert.c
index f0df9a0..35854cd 100644
--- a/src/convert.c
+++ b/src/convert.c
@@ -46,6 +46,7 @@ as that of the covered work. */
#include "html-url.h"
#include "css-url.h"
#include "iri.h"
+#include "xstrndup.h"
static struct hash_table *dl_file_url_map;
struct hash_table *dl_url_file_map;
@@ -136,8 +137,9 @@ convert_links_in_hashtable (struct hash_table *downloaded_set,
form. We do this even if the URL already is in
relative form, because our directory structure may
not be identical to that on the server (think `-nd',
- `--cut-dirs', etc.) */
- cur_url->convert = CO_CONVERT_TO_RELATIVE;
+ `--cut-dirs', etc.). If --convert-file-only was passed,
+ we only convert the basename portion of the URL. */
+ cur_url->convert = (opt.convert_file_only ? CO_CONVERT_BASENAME_ONLY : CO_CONVERT_TO_RELATIVE);
cur_url->local_name = xstrdup (local_name);
DEBUGP (("will convert url %s to local %s\n", u->url, local_name));
}
@@ -206,6 +208,7 @@ static const char *replace_attr_refresh_hack (const char *, int, FILE *,
const char *, int);
static char *local_quote_string (const char *, bool);
static char *construct_relative (const char *, const char *);
+static char *convert_basename (const char *, const struct urlpos *);
/* Change the links in one file. LINKS is a list of links in the
document, along with their positions and the desired direction of
@@ -315,9 +318,32 @@ convert_links (const char *file, struct urlpos *links)
DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",
link->url->url, newname, link->pos, file));
+
+ xfree (newname);
+ xfree (quoted_newname);
+ ++to_file_count;
+ break;
+ }
+ case CO_CONVERT_BASENAME_ONLY:
+ {
+ char *newname = convert_basename (p, link);
+ char *quoted_newname = local_quote_string (newname, link->link_css_p);
+
+ if (link->link_css_p)
+ p = replace_plain (p, link->size, fp, quoted_newname);
+ else if (!link->link_refresh_p)
+ p = replace_attr (p, link->size, fp, quoted_newname);
+ else
+ p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,
+ link->refresh_timeout);
+
+ DEBUGP (("Converted file part only: %s to %s at position %d in %s.\n",
+ link->url->url, newname, link->pos, file));
+
xfree (newname);
xfree (quoted_newname);
++to_file_count;
+
break;
}
case CO_CONVERT_TO_COMPLETE:
@@ -336,6 +362,7 @@ convert_links (const char *file, struct urlpos *links)
DEBUGP (("TO_COMPLETE: to %s at position %d in %s.\n",
newlink, link->pos, file));
+
xfree (quoted_newlink);
++to_url_count;
break;
@@ -422,6 +449,70 @@ construct_relative (const char *basefile, const char *linkfile)
return link;
}
+/* Construct and return a "transparent proxy" URL
+ reflecting changes made by --adjust-extension to the file component
+ (i.e., "basename") of the original URL, but leaving the "dirname"
+ of the URL (protocol://hostname... portion) untouched.
+
+ Think: populating a squid cache via a recursive wget scrape, where
+ changing URLs to work locally with "file://..." is NOT desirable.
+
+ Example:
+
+ if
+ p = "//foo.com/bar.cgi?xyz"
+ and
+ link->local_name = "docroot/foo.com/bar.cgi?xyz.css"
+ then
+
+ new_construct_func(p, link);
+ will return
+ "//foo.com/bar.cgi?xyz.css"
+
+ Essentially, we do s/$(basename orig_url)/$(basename link->local_name)/
+*/
+static char *
+convert_basename (const char *p, const struct urlpos *link)
+{
+ int len = link->size;
+ char *url = NULL;
+ char *org_basename = NULL, *local_basename = NULL;
+ char *result = url;
+
+ if (*p == '"' || *p == '\'')
+ {
+ len -= 2;
+ p++;
+ }
+
+ url = xstrndup (p, len);
+
+ org_basename = strrchr (url, '/');
+ if (org_basename)
+ org_basename++;
+ else
+ org_basename = url;
+
+ local_basename = strrchr (link->local_name, '/');
+ if (local_basename)
+ local_basename++;
+ else
+ local_basename = url;
+
+ /*
+ * If the basenames differ, graft the adjusted basename (local_basename)
+ * onto the original URL.
+ */
+ if (strcmp (org_basename, local_basename))
+ result = uri_merge (url, local_basename);
+ else
+ result = xstrdup (url);
+
+ xfree (url);
+
+ return result;
+}
+
/* Used by write_backup_file to remember which files have been
written. */
static struct hash_table *converted_files;
diff --git a/src/convert.h b/src/convert.h
index 3105db1..b3cd196 100644
--- a/src/convert.h
+++ b/src/convert.h
@@ -40,6 +40,8 @@ enum convert_options {
CO_NOCONVERT = 0, /* don't convert this URL */
CO_CONVERT_TO_RELATIVE, /* convert to relative, e.g. to
"../../otherdir/foo.gif" */
+ CO_CONVERT_BASENAME_ONLY, /* convert the file portion only (basename)
+ leaving the rest of the URL unchanged */
CO_CONVERT_TO_COMPLETE, /* convert to absolute, e.g. to
"http://orighost/somedir/bar.jpg". */
CO_NULLIFY_BASE /* change to empty string. */
diff --git a/src/init.c b/src/init.c
index dd1506c..67c94b9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -159,6 +159,7 @@ static const struct {
{ "contentdisposition", &opt.content_disposition, cmd_boolean },
{ "contentonerror", &opt.content_on_error, cmd_boolean },
{ "continue", &opt.always_rest, cmd_boolean },
+ { "convertfileonly", &opt.convert_file_only, cmd_boolean },
{ "convertlinks", &opt.convert_links, cmd_boolean },
{ "cookies", &opt.cookies, cmd_boolean },
#ifdef HAVE_SSL
@@ -377,6 +378,7 @@ defaults (void)
opt.htmlify = true;
opt.http_keep_alive = true;
opt.use_proxy = true;
+ opt.convert_file_only = false;
tmp = getenv ("no_proxy");
if (tmp)
opt.no_proxy = sepstring (tmp);
diff --git a/src/main.c b/src/main.c
index 92d1bee..cfdc361 100644
--- a/src/main.c
+++ b/src/main.c
@@ -264,6 +264,7 @@ static struct cmdline_option option_data[] =
{ "config", 0, OPT_VALUE, "chooseconfig", -1 },
{ "connect-timeout", 0, OPT_VALUE, "connecttimeout", -1 },
{ "continue", 'c', OPT_BOOLEAN, "continue", -1 },
+ { "convert-file-only", 0, OPT_BOOLEAN, "convertfileonly", -1 },
{ "convert-links", 'k', OPT_BOOLEAN, "convertlinks", -1 },
{ "content-disposition", 0, OPT_BOOLEAN, "contentdisposition", -1 },
{ "content-on-error", 0, OPT_BOOLEAN, "contentonerror", -1 },
@@ -877,6 +878,8 @@ Recursive download:\n"),
-k, --convert-links make links in downloaded HTML or CSS point to\n\
local files\n"),
N_("\
+ --convert-file-only convert the file part of the URLs only (usually known as the basename)\n"),
+ N_("\
--backups=N before writing file X, rotate up to N backup files\n"),
#ifdef __VMS
@@ -1387,11 +1390,14 @@ main (int argc, char **argv)
/* All user options have now been processed, so it's now safe to do
interoption dependency checks. */
- if (opt.noclobber && opt.convert_links)
+ if (opt.noclobber && (opt.convert_links || opt.convert_file_only))
{
fprintf (stderr,
- _("Both --no-clobber and --convert-links were specified,"
- " only --convert-links will be used.\n"));
+ opt.convert_links ?
+ _("Both --no-clobber and --convert-links were specified,"
+ " only --convert-links will be used.\n") :
+ _("Both --no-clobber and --convert-file-only were specified,"
+ " only --convert-file-only will be used.\n"));
opt.noclobber = false;
}
@@ -1445,11 +1451,11 @@ Can't timestamp and not clobber old files at the same time.\n"));
#endif
if (opt.output_document)
{
- if (opt.convert_links
+ if ((opt.convert_links || opt.convert_file_only)
&& (nurl > 1 || opt.page_requisites || opt.recursive))
{
fputs (_("\
-Cannot specify both -k and -O if multiple URLs are given, or in combination\n\
+Cannot specify both -k or --convert-file-only and -O if multiple URLs are given, or in combination\n\
with -p or -r. See the manual for details.\n\n"), stderr);
print_usage (1);
exit (WGET_EXIT_GENERIC_ERROR);
@@ -1760,6 +1766,12 @@ for details.\n\n"));
outputting to a regular file.\n"));
exit (WGET_EXIT_GENERIC_ERROR);
}
+ if (!output_stream_regular && (opt.convert_links || opt.convert_file_only))
+ {
+ fprintf (stderr, _("--convert-links or --convert-file-only can be used together \
+only if outputting to a regular file.\n"));
+ exit (WGET_EXIT_GENERIC_ERROR);
+ }
}
#ifdef __VMS
@@ -1970,7 +1982,7 @@ outputting to a regular file.\n"));
save_hsts ();
#endif
- if (opt.convert_links && !opt.delete_after)
+ if ((opt.convert_links || opt.convert_file_only) && !opt.delete_after)
convert_all_links ();
cleanup ();
diff --git a/src/options.h b/src/options.h
index 0bb7d71..dad08c1 100644
--- a/src/options.h
+++ b/src/options.h
@@ -182,6 +182,9 @@ struct options
NULL. */
bool convert_links; /* Will the links be converted
locally? */
+ bool convert_file_only; /* Convert only the file portion of the URI (i.e. basename).
+ Leave everything else untouched. */
+
bool remove_listing; /* Do we remove .listing files
generated by FTP? */
bool htmlify; /* Do we HTML-ify the OS-dependent
--
2.1.4