[Bug-wget] Patch for bug #20382
From: gerel
Subject: [Bug-wget] Patch for bug #20382
Date: Sun, 01 Feb 2009 12:06:07 -0800 (PST)
Hi everyone,
Here I attach a patch for bug #20382: "Wget parses the same URI multiple times
throughout the code".
Hope all is fine.
cheers
-gerel
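
In short, each call site is restructured along the same lines: parse the URI
once up front, report a parse failure through url_error, and hand the parsed
struct url down to the callee instead of letting it re-parse the string. A
condensed sketch using the helpers that appear in the hunks below (the
surrounding control flow is illustrative, not a verbatim excerpt):

  int url_err;
  struct url *url_parsed = url_parse (url, &url_err);

  if (!url_parsed)
    {
      /* Parsing failed: report it once and give up on this URI.  */
      char *error = url_error (url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
      xfree (error);
      status = URLERROR;
    }
  else
    {
      /* The callee borrows url_parsed; the caller frees it once.  */
      status = retrieve_url (url_parsed, url, &file, &redirected,
                             referer, &dt, false);
      url_free (url_parsed);
    }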
##
Exporting patches:
# HG changeset patch
# User address@hidden
# Date 1233504392 10800
# Node ID 565d1abd73d20c891eadec22d87bb2c791f24e5b
# Parent 1b4062e241879e6718b08d88b2832fd358757e4f
one less call
diff -r 1b4062e241879e6718b08d88b2832fd358757e4f -r 565d1abd73d20c891eadec22d87bb2c791f24e5b src/recur.c
--- a/src/recur.c Mon Dec 01 22:59:03 2008 -0800
+++ b/src/recur.c Sun Feb 01 13:06:32 2009 -0300
@@ -154,7 +154,7 @@ url_dequeue (struct url_queue *queue,
static bool download_child_p (const struct urlpos *, struct url *, int,
struct url *, struct hash_table *);
-static bool descend_redirect_p (const char *, const char *, int,
+static bool descend_redirect_p (const char *, struct url *, int,
struct url *, struct hash_table *);
@@ -264,10 +264,21 @@ retrieve_tree (const char *start_url)
}
else
{
- int dt = 0;
+ int dt = 0, url_err;
char *redirected = NULL;
-
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ struct url *url_parsed = url_parse (url, &url_err);
+
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ status = URLERROR;
+ }
+ else
+ {
+ status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ }
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
@@ -294,7 +305,7 @@ retrieve_tree (const char *start_url)
want to follow it. */
if (descend)
{
- if (!descend_redirect_p (redirected, url, depth,
+ if (!descend_redirect_p (redirected, url_parsed, depth,
start_url_parsed, blacklist))
descend = false;
else
@@ -306,6 +317,7 @@ retrieve_tree (const char *start_url)
xfree (url);
url = redirected;
}
+ url_free(url_parsed);
}
if (opt.spider)
@@ -656,14 +668,13 @@ download_child_p (const struct urlpos *u
it is merely a simple-minded wrapper around download_child_p. */
static bool
-descend_redirect_p (const char *redirected, const char *original, int depth,
+descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
struct url *start_url_parsed, struct hash_table *blacklist)
{
- struct url *orig_parsed, *new_parsed;
+ struct url *new_parsed;
struct urlpos *upos;
bool success;
- orig_parsed = url_parse (original, NULL);
assert (orig_parsed != NULL);
new_parsed = url_parse (redirected, NULL);
@@ -675,7 +686,6 @@ descend_redirect_p (const char *redirect
success = download_child_p (upos, orig_parsed, depth,
start_url_parsed, blacklist);
- url_free (orig_parsed);
url_free (new_parsed);
xfree (upos);
# HG changeset patch
# User "gerel <address@hidden>"
# Date 1233511431 10800
# Node ID 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4
# Parent 565d1abd73d20c891eadec22d87bb2c791f24e5b
removed some more calls
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/main.c
--- a/src/main.c Sun Feb 01 13:06:32 2009 -0300
+++ b/src/main.c Sun Feb 01 15:03:51 2009 -0300
@@ -1178,34 +1178,45 @@ WARNING: Can't reopen standard output in
for (t = url; *t; t++)
{
char *filename = NULL, *redirected_URL = NULL;
- int dt;
-
- if ((opt.recursive || opt.page_requisites)
- && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
- {
- int old_follow_ftp = opt.follow_ftp;
-
- /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
- opt.follow_ftp = 1;
+ int dt, url_err;
+ struct url *url_parsed = url_parse (*t, &url_err);
+
+ if (!url_parsed)
+ {
+ char *error = url_error (*t, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error);
+ xfree (error);
+ status = URLERROR;
+ }
+ else
+ {
+ if ((opt.recursive || opt.page_requisites)
+ && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
+ {
+ int old_follow_ftp = opt.follow_ftp;
+
+ /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
+ if (url_scheme (*t) == SCHEME_FTP)
+ opt.follow_ftp = 1;
- status = retrieve_tree (*t);
-
- opt.follow_ftp = old_follow_ftp;
- }
- else
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
-
- if (opt.delete_after && file_exists_p(filename))
- {
- DEBUGP (("Removing file due to --delete-after in main():\n"));
- logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
- if (unlink (filename))
- logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
- }
-
- xfree_null (redirected_URL);
- xfree_null (filename);
+ status = retrieve_tree (url_parsed);
+
+ opt.follow_ftp = old_follow_ftp;
+ }
+ else
+ status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
+
+ if (opt.delete_after && file_exists_p(filename))
+ {
+ DEBUGP (("Removing file due to --delete-after in main():\n"));
+ logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
+ if (unlink (filename))
+ logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+ }
+ xfree_null (redirected_URL);
+ xfree_null (filename);
+ url_free (url_parsed);
+ }
}
/* And then from the input file, if any. */
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/recur.c
--- a/src/recur.c Sun Feb 01 13:06:32 2009 -0300
+++ b/src/recur.c Sun Feb 01 15:03:51 2009 -0300
@@ -180,7 +180,7 @@ static bool descend_redirect_p (const ch
options, add it to the queue. */
uerr_t
-retrieve_tree (const char *start_url)
+retrieve_tree (struct url *start_url_parsed)
{
uerr_t status = RETROK;
@@ -190,17 +190,6 @@ retrieve_tree (const char *start_url)
/* The URLs we do not wish to enqueue, because they are already in
the queue, but haven't been downloaded yet. */
struct hash_table *blacklist;
-
- int up_error_code;
- struct url *start_url_parsed = url_parse (start_url, &up_error_code);
-
- if (!start_url_parsed)
- {
- char *error = url_error (start_url, up_error_code);
- logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
- xfree (error);
- return URLERROR;
- }
queue = url_queue_new ();
blacklist = make_string_hash_table (0);
@@ -277,7 +266,8 @@ retrieve_tree (const char *start_url)
}
else
{
- status = retrieve_url (url, &file, &redirected, referer, &dt, false);
+ status = retrieve_url (url_parsed, url, &file, &redirected,
+ referer, &dt, false);
}
if (html_allowed && file && status == RETROK
@@ -451,8 +441,6 @@ retrieve_tree (const char *start_url)
}
url_queue_delete (queue);
- if (start_url_parsed)
- url_free (start_url_parsed);
string_set_free (blacklist);
if (opt.quota && total_downloaded_bytes > opt.quota)
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/recur.h
--- a/src/recur.h Sun Feb 01 13:06:32 2009 -0300
+++ b/src/recur.h Sun Feb 01 15:03:51 2009 -0300
@@ -31,6 +31,8 @@ as that of the covered work. */
#ifndef RECUR_H
#define RECUR_H
+#include "url.h"
+
/* For most options, 0 means no limits, but with -p in the picture,
that causes a problem on the maximum recursion depth variable. To
retain backwards compatibility we allow users to consider "0" to be
@@ -42,6 +44,6 @@ struct urlpos;
struct urlpos;
void recursive_cleanup (void);
-uerr_t retrieve_tree (const char *);
+uerr_t retrieve_tree (struct url *);
#endif /* RECUR_H */
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/res.c
--- a/src/res.c Sun Feb 01 13:06:32 2009 -0300
+++ b/src/res.c Sun Feb 01 15:03:51 2009 -0300
@@ -537,13 +537,29 @@ res_retrieve_file (const char *url, char
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
- int saved_sp_val = opt.spider;
+ int saved_sp_val = opt.spider, url_err;
+ struct url * url_parsed;
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
+
+ url_parsed = url_parse (robots_url, &url_err);
+ if (!url_parsed)
+ {
+ char *error = url_error (robots_url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+ xfree (error);
+ err = URLERROR;
+ }
+ else
+ {
+ err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
+ false);
+ url_free(url_parsed);
+ }
+
opt.timestamping = saved_ts_val;
opt.spider = saved_sp_val;
xfree (robots_url);
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/retr.c
--- a/src/retr.c Sun Feb 01 13:06:32 2009 -0300
+++ b/src/retr.c Sun Feb 01 15:03:51 2009 -0300
@@ -596,15 +596,15 @@ static char *getproxy (struct url *);
multiple points. */
uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+ char **newloc, const char *refurl, int *dt, bool recursive)
{
uerr_t result;
char *url;
bool location_changed;
int dummy;
char *mynewloc, *proxy;
- struct url *u, *proxy_url;
+ struct url *u = orig_parsed, *proxy_url;
int up_error_code; /* url parse error code */
char *local_file;
int redirection_count = 0;
@@ -624,16 +624,6 @@ retrieve_url (const char *origurl, char
*newloc = NULL;
if (file)
*file = NULL;
-
- u = url_parse (url, &up_error_code);
- if (!u)
- {
- char *error = url_error (url, up_error_code);
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
- xfree (url);
- xfree (error);
- return URLERROR;
- }
if (!refurl)
refurl = opt.referer;
@@ -733,7 +723,10 @@ retrieve_url (const char *origurl, char
char *error = url_error (mynewloc, up_error_code);
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
error);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
xfree (error);
@@ -753,7 +746,10 @@ retrieve_url (const char *origurl, char
logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
opt.max_redirect);
url_free (newloc_parsed);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
RESTORE_POST_DATA;
@@ -762,7 +758,10 @@ retrieve_url (const char *origurl, char
xfree (url);
url = mynewloc;
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
u = newloc_parsed;
/* If we're being redirected from POST, we don't want to POST
@@ -795,7 +794,10 @@ retrieve_url (const char *origurl, char
else
xfree_null (local_file);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
if (redirection_count)
{
@@ -836,13 +838,22 @@ retrieve_from_file (const char *file, bo
if (url_has_scheme (url))
{
- int dt;
+ int dt,url_err;
uerr_t status;
+ struct url * url_parsed = url_parse(url, &url_err);
+
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ return URLERROR;
+ }
if (!opt.base_href)
opt.base_href = xstrdup (url);
- status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
+ status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
if (status != RETROK)
return status;
@@ -877,12 +888,15 @@ retrieve_from_file (const char *file, bo
if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
- status = retrieve_tree (cur_url->url->url);
+ status = retrieve_tree (cur_url->url);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ {
+ status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
+ &new_file, NULL, &dt, opt.recursive);
+ }
if (filename && opt.delete_after && file_exists_p (filename))
{
@@ -1050,14 +1064,12 @@ getproxy (struct url *u)
/* Returns true if URL would be downloaded through a proxy. */
bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
{
bool ret;
- struct url *u = url_parse (url, NULL);
if (!u)
return false;
ret = getproxy (u) != NULL;
- url_free (u);
return ret;
}
diff -r 565d1abd73d20c891eadec22d87bb2c791f24e5b -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 src/retr.h
--- a/src/retr.h Sun Feb 01 13:06:32 2009 -0300
+++ b/src/retr.h Sun Feb 01 15:03:51 2009 -0300
@@ -31,6 +31,8 @@ as that of the covered work. */
#ifndef RETR_H
#define RETR_H
+#include "url.h"
+
/* These global vars should be made static to retr.c and exported via
functions! */
extern SUM_SIZE_INT total_downloaded_bytes;
@@ -51,7 +53,7 @@ char *fd_read_hunk (int, hunk_terminator
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
+uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);
@@ -62,6 +64,6 @@ void sleep_between_retrievals (int);
void rotate_backups (const char *);
-bool url_uses_proxy (const char *);
+bool url_uses_proxy (struct url *);
#endif /* RETR_H */
# HG changeset patch
# User "gerel <address@hidden>"
# Date 1233513848 10800
# Node ID 292aea0c3956e94d74d4dfabd464fad6af375425
# Parent 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4
added changelog entry
diff -r 2fe25ad6bedc11ab5b17a33ed87aaf14911ec4d4 -r 292aea0c3956e94d74d4dfabd464fad6af375425 ChangeLog
--- a/ChangeLog Sun Feb 01 15:03:51 2009 -0300
+++ b/ChangeLog Sun Feb 01 15:44:08 2009 -0300
@@ -1,3 +1,17 @@ 2008-11-10 Micah Cowan <address@hidden
+2009-02-01 Gerardo E. Gidoni <address@hidden>
+
+ * src/main.c: restructured code to avoid multiple 'url_parse' calls.
+
+ * src/recur.c: same.
+
+ * src/recur.h: same.
+
+ * src/res.c: same.
+
+ * src/retr.c: same.
+
+ * src/retr.h: same.
+
2008-11-10 Micah Cowan <address@hidden>
* MAILING-LIST: Mention Gmane, introduce subsections.
###
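
For reference, the ownership rule the retr.c hunks introduce (the repeated
"if (orig_parsed != u) url_free (u);" guards) can be summarized in a
standalone toy program: retrieve_url now borrows the caller's parsed URL and
frees only the replacement objects it allocates itself while following
redirects, so the caller frees its own URL exactly once. The toy_url,
toy_parse and toy_free names below are hypothetical stand-ins for Wget's
struct url, url_parse and url_free, not Wget API:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  struct toy_url { char *spec; };

  static struct toy_url *
  toy_parse (const char *spec)
  {
    struct toy_url *u = malloc (sizeof *u);
    u->spec = strdup (spec);
    return u;
  }

  static void
  toy_free (struct toy_url *u)
  {
    free (u->spec);
    free (u);
  }

  /* Mirrors the reworked retrieve_url: borrow ORIG from the caller,
     swap in freshly parsed URLs while following redirects, and free
     only the objects allocated here, never the caller's.  */
  static void
  toy_retrieve (struct toy_url *orig, const char **redirects, int n)
  {
    struct toy_url *u = orig;
    int i;

    for (i = 0; i < n; i++)
      {
        struct toy_url *next = toy_parse (redirects[i]);
        if (u != orig)          /* free only URLs parsed here */
          toy_free (u);
        u = next;
      }
    printf ("fetched %s\n", u->spec);
    if (u != orig)
      toy_free (u);
  }

  int
  main (void)
  {
    const char *hops[] = { "http://example.com/a", "http://example.com/b" };
    struct toy_url *u = toy_parse ("http://example.com/");

    toy_retrieve (u, hops, 2);
    toy_free (u);               /* the caller frees exactly once */
    return 0;
  }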