uerr_t err, ret = TRYLIMEXC;
time_t tmr = -1; /* remote time-stamp */
struct http_stat hstat; /* HTTP status */
- struct_stat st;
+ struct_stat st;
bool send_head_first = false;
+ char *file_name;
/* Assert that no value for *LOCAL_FILE was passed. */
assert (local_file == NULL || *local_file == NULL);
/* Send preliminary HEAD request if -N is given and we have an existing
* destination file. */
- if (opt.timestamping
+ file_name = url_file_name (u);
+ if (opt.timestamping
&& !opt.content_disposition
- && file_exists_p (url_file_name (u)))
+ && file_exists_p (file_name))
send_head_first = true;
-
+ xfree (file_name);
+
/* THE loop */
do
{
for (t = url; *t; t++)
{
char *filename = NULL, *redirected_URL = NULL;
- int dt;
+ int dt, url_err;
- struct url *url_parsed = url_parse (*t, &url_err);
++ struct url *url_parsed = url_parse (*t, &url_err, NULL, false);
- if ((opt.recursive || opt.page_requisites)
- && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
+ if (!url_parsed)
{
- int old_follow_ftp = opt.follow_ftp;
-
- /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (url_scheme (*t) == SCHEME_FTP)
- opt.follow_ftp = 1;
-
- status = retrieve_tree (*t, NULL);
-
- opt.follow_ftp = old_follow_ftp;
+ char *error = url_error (*t, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", *t, error);
+ xfree (error);
+ status = URLERROR;
}
else
{
- struct iri *i = iri_new ();
- set_uri_encoding (i, opt.locale, true);
- status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
- opt.recursive, i);
- iri_free (i);
- }
+ if ((opt.recursive || opt.page_requisites)
+ && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
+ {
+ int old_follow_ftp = opt.follow_ftp;
- if (opt.delete_after && file_exists_p(filename))
- {
- DEBUGP (("Removing file due to --delete-after in main():\n"));
- logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
- if (unlink (filename))
- logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
- }
+ /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
+ if (url_scheme (*t) == SCHEME_FTP)
+ opt.follow_ftp = 1;
+
- status = retrieve_tree (url_parsed);
++ status = retrieve_tree (url_parsed, NULL);
- xfree_null (redirected_URL);
- xfree_null (filename);
+ opt.follow_ftp = old_follow_ftp;
+ }
+ else
- status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
++ {
++ struct iri *i = iri_new ();
++ set_uri_encoding (i, opt.locale, true);
++ status = retrieve_url (url_parsed, *t, &filename, &redirected_URL,
++ NULL, &dt, opt.recursive, i);
++ iri_free (i);
++ }
+
+ if (opt.delete_after && file_exists_p(filename))
+ {
+ DEBUGP (("Removing file due to --delete-after in main():\n"));
+ logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
+ if (unlink (filename))
+ logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+ }
+ xfree_null (redirected_URL);
+ xfree_null (filename);
+ url_free (url_parsed);
+ }
}
/* And then from the input file, if any. */
}
\f
static bool download_child_p (const struct urlpos *, struct url *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
-static bool descend_redirect_p (const char *, const char *, int,
+static bool descend_redirect_p (const char *, struct url *, int,
- struct url *, struct hash_table *);
+ struct url *, struct hash_table *, struct iri *);
/* Retrieve a part of the web beginning with START_URL. This used to
options, add it to the queue. */
uerr_t
- retrieve_tree (struct url *start_url_parsed)
-retrieve_tree (const char *start_url, struct iri *pi)
++retrieve_tree (struct url *start_url_parsed, struct iri *pi)
{
uerr_t status = RETROK;
the queue, but haven't been downloaded yet. */
struct hash_table *blacklist;
- struct url *start_url_parsed;
+ int up_error_code;
- start_url_parsed = url_parse (start_url, &up_error_code, i, true);
- if (!start_url_parsed)
- {
- char *error = url_error (start_url, up_error_code);
- logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
- xfree (error);
- return URLERROR;
- }
-
+ struct iri *i = iri_new ();
+
+ #define COPYSTR(x) ((x) ? xstrdup (x) : NULL)
+ /* Duplicate pi struct if not NULL */
+ if (pi)
+ {
+ i->uri_encoding = COPYSTR (pi->uri_encoding);
+ i->content_encoding = COPYSTR (pi->content_encoding);
+ i->utf8_encode = pi->utf8_encode;
+ }
+ else
+ set_uri_encoding (i, opt.locale, true);
+ #undef COPYSTR
+
queue = url_queue_new ();
blacklist = make_string_hash_table (0);
}
else
{
- int dt = 0;
+ int dt = 0, url_err;
char *redirected = NULL;
- struct url *url_parsed = url_parse (url, &url_err);
++ struct url *url_parsed = url_parse (url, &url_err, i, false);
- if (!url_parsed)
- {
- char *error = url_error (url, url_err);
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
- xfree (error);
- status = URLERROR;
- }
- else
- {
- status = retrieve_url (url_parsed, url, &file, &redirected,
- referer, &dt, false);
- }
- status = retrieve_url (url, &file, &redirected, referer, &dt,
- false, i);
++ status = retrieve_url (url_parsed, url, &file, &redirected, referer,
++ &dt, false, i);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
want to follow it. */
if (descend)
{
- if (!descend_redirect_p (redirected, url, depth,
+ if (!descend_redirect_p (redirected, url_parsed, depth,
- start_url_parsed, blacklist))
+ start_url_parsed, blacklist, i))
descend = false;
else
/* Make sure that the old pre-redirect form gets
it is merely a simple-minded wrapper around download_child_p. */
static bool
-descend_redirect_p (const char *redirected, const char *original, int depth,
+descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
- struct url *start_url_parsed, struct hash_table *blacklist)
+ struct url *start_url_parsed, struct hash_table *blacklist,
+ struct iri *iri)
{
- struct url *orig_parsed, *new_parsed;
+ struct url *new_parsed;
struct urlpos *upos;
bool success;
- orig_parsed = url_parse (original, NULL, NULL, false);
assert (orig_parsed != NULL);
- new_parsed = url_parse (redirected, NULL);
+ new_parsed = url_parse (redirected, NULL, NULL, false);
assert (new_parsed != NULL);
upos = xnew0 (struct urlpos);
upos->url = new_parsed;
success = download_child_p (upos, orig_parsed, depth,
- start_url_parsed, blacklist);
+ start_url_parsed, blacklist, iri);
- url_free (orig_parsed);
url_free (new_parsed);
xfree (upos);
struct urlpos;
void recursive_cleanup (void);
- uerr_t retrieve_tree (struct url *);
-uerr_t retrieve_tree (const char *, struct iri *);
++uerr_t retrieve_tree (struct url *, struct iri *);
#endif /* RECUR_H */
uerr_t err;
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
int saved_ts_val = opt.timestamping;
- int saved_sp_val = opt.spider;
+ int saved_sp_val = opt.spider, url_err;
+ struct url * url_parsed;
+ /* Copy server URI encoding for a possible IDNA transformation, no need to
+ encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+ set_uri_encoding (i, iri->uri_encoding, false);
+ i->utf8_encode = false;
+
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
*file = NULL;
opt.timestamping = false;
opt.spider = false;
- err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
+
- url_parsed = url_parse (robots_url, &url_err);
++ url_parsed = url_parse (robots_url, &url_err, iri, true);
+ if (!url_parsed)
+ {
+ char *error = url_error (robots_url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+ xfree (error);
+ err = URLERROR;
+ }
+ else
+ {
+ err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
- false);
++ false, i);
+ url_free(url_parsed);
+ }
+
opt.timestamping = saved_ts_val;
- opt.spider = saved_sp_val;
+ opt.spider = saved_sp_val;
xfree (robots_url);
+ iri_free (i);
if (err != RETROK && *file != NULL)
{
multiple points. */
uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive, struct iri *iri)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
- char **newloc, const char *refurl, int *dt, bool recursive)
++ char **newloc, const char *refurl, int *dt, bool recursive,
++ struct iri *iri)
{
uerr_t result;
char *url;
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code, iri, true);
- if (!u)
- {
- char *error = url_error (url, up_error_code);
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
- xfree (url);
- xfree (error);
- return URLERROR;
- }
-
+ second_try:
+ DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
+ iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
+ iri->utf8_encode));
+
if (!refurl)
refurl = opt.referer;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
-
+
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_uri_encoding (iri, opt.locale, true);
+ set_content_encoding (iri, opt.locale);
+
if (url_has_scheme (url))
{
- int dt;
+ int dt, url_err;
uerr_t status;
- struct url * url_parsed = url_parse(url, &url_err);
++ struct url * url_parsed = url_parse(url, &url_err, NULL, true);
+
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ return URLERROR;
+ }
if (!opt.base_href)
opt.base_href = xstrdup (url);
- status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
- status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
++ status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
++ false, iri);
if (status != RETROK)
return status;
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
- status = retrieve_tree (cur_url->url);
+
- status = retrieve_tree (cur_url->url->url, iri);
++ status = retrieve_tree (cur_url->url, iri);
opt.follow_ftp = old_follow_ftp;
}
else
- {
- status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
- &new_file, NULL, &dt, opt.recursive);
- }
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
- &dt, opt.recursive, iri);
++ status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
++ &new_file, NULL, &dt, opt.recursive, iri);
if (filename && opt.delete_after && file_exists_p (filename))
{
char *fd_read_hunk (int, hunk_terminator_t, long, long);
char *fd_read_line (int);
- uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
- bool, struct iri *);
++uerr_t retrieve_url (struct url *, const char *, char **, char **,
++ const char *, int *, bool, struct iri *);
uerr_t retrieve_from_file (const char *, bool, int *);
const char *retr_rate (wgint, double);
int port;
char *user = NULL, *passwd = NULL;
- char *url_encoded = NULL;
- char *url_encoded = NULL, *new_url = NULL;
++ const char *url_encoded = NULL;
++ char *new_url = NULL;
int error_code;
if (url_encoded == url)
u->url = xstrdup (url);
else
-- u->url = url_encoded;
++ u->url = (char *) url_encoded;
}
return u;
error:
/* Cleanup in case of error: */
if (url_encoded && url_encoded != url)
-- xfree (url_encoded);
++ xfree ((char *) url_encoded);
/* Transmit the error code to the caller, if the caller wants to
know. */
+ 2008-12-04 Micah Cowan <micah@cowan.name> (not copyrightable)
+
+ * run-px, Test-idn-robots.px: Added test for robots-file
+ downloads.
+
+ * Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
+ Fix test names.
+
+ 2008-11-26 Micah Cowan <micah@cowan.name> (not copyrightable)
+
+ * Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
+ Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px,
+ Test-idn-meta.px, Test-iri-disabled.px,
+ Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More
+ module-scope warnings.
+
+2009-06-14 Micah Cowan <micah@cowan.name>
+
+ * Makefile.am (EXTRA_DIST): Include all the tests, run-px, and
+ certs/, to make distcheck happy.
+
+2009-06-11 Benjamin Wolsey <bwy@benjaminwolsey.de>
+
+ * Test-proxied-https-auth.px: Take an optional argument for the
+ top source directory, so we can find the cert and key.
+
+ * run-px: Provide the top source directory as an argument, so
+ scripts can find their way around.
+
+2009-04-11 Steven Schubiger <stsc@member.fsf.org>
+
+ * run-px: Skip testing with real rc files by setting
+ SYSTEM_WGETRC and WGETRC to /dev/null.
+
+2009-02-25 Benjamin Wolsey <bwy@benjaminwolsey.de>
+
+ * Makefile.am (run-px-tests): Ensure run-px is run from srcdir.
+
+ * run-px: Include modules from srcdir.
+
2008-11-25 Steven Schubiger <stsc@members.fsf.org>
* WgetTest.pm.in: Remove the magic interpreter line;