From: Micah Cowan
Date: Thu, 25 Jun 2009 08:14:11 +0000 (-0700)
Subject: Merge with mainline.
X-Git-Tag: v1.13~338
X-Git-Url: http://sjero.net/git/?p=wget;a=commitdiff_plain;h=4f3dd6817348433eafde04a3c2946f43364de7ef

Merge with mainline.
---

4f3dd6817348433eafde04a3c2946f43364de7ef
diff --cc src/http.c
index 50f0c643,9ed226cb..ae89c46d
--- a/src/http.c
+++ b/src/http.c
@@@ -2359,9 -2355,8 +2371,9 @@@ http_loop (struct url *u, char **newloc
   uerr_t err, ret = TRYLIMEXC;
   time_t tmr = -1;               /* remote time-stamp */
   struct http_stat hstat;        /* HTTP status */
-  struct_stat st;
+  struct_stat st;
   bool send_head_first = true;
+  char *file_name;
 
   /* Assert that no value for *LOCAL_FILE was passed. */
   assert (local_file == NULL || *local_file == NULL);
@@@ -2434,13 -2429,11 +2446,13 @@@ File %s already there; not retrieving.\
   /* Send preliminary HEAD request if -N is given and we have an existing
    * destination file. */
+  file_name = url_file_name (u);
-  if (opt.timestamping
+  if (opt.timestamping && !opt.content_disposition
-      && file_exists_p (url_file_name (u)))
+      && file_exists_p (file_name))
     send_head_first = true;
- 
+  xfree (file_name);
+ 
   /* THE loop */
   do
     {
diff --cc src/main.c
index b8039d6b,a2d40888..69df08a7
--- a/src/main.c
+++ b/src/main.c
@@@ -1178,45 -1202,40 +1202,51 @@@ WARNING: Can't reopen standard output i
   for (t = url; *t; t++)
     {
       char *filename = NULL, *redirected_URL = NULL;
-      int dt;
+      int dt, url_err;
-      struct url *url_parsed = url_parse (*t, &url_err);
++     struct url *url_parsed = url_parse (*t, &url_err, NULL, false);
 
-      if ((opt.recursive || opt.page_requisites)
-          && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (*t)))
+      if (!url_parsed)
         {
-          int old_follow_ftp = opt.follow_ftp;
- 
-          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (url_scheme (*t) == SCHEME_FTP)
-            opt.follow_ftp = 1;
- 
-          status = retrieve_tree (*t, NULL);
- 
-          opt.follow_ftp = old_follow_ftp;
+          char *error = url_error (*t, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n",*t, error);
+          xfree (error);
+          status = URLERROR;
         }
       else
         {
-          struct iri *i = iri_new ();
-          set_uri_encoding (i, opt.locale, true);
-          status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt,
-                                 opt.recursive, i);
-          iri_free (i);
-        }
+          if ((opt.recursive || opt.page_requisites)
+              && (url_scheme (*t) != SCHEME_FTP || url_uses_proxy (url_parsed)))
+            {
+              int old_follow_ftp = opt.follow_ftp;
 
-      if (opt.delete_after && file_exists_p(filename))
-        {
-          DEBUGP (("Removing file due to --delete-after in main():\n"));
-          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
-          if (unlink (filename))
-            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
-        }
+              /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
+              if (url_scheme (*t) == SCHEME_FTP)
+                opt.follow_ftp = 1;
+
-              status = retrieve_tree (url_parsed);
++             status = retrieve_tree (url_parsed, NULL);
 
-      xfree_null (redirected_URL);
-      xfree_null (filename);
+              opt.follow_ftp = old_follow_ftp;
+            }
+          else
-            status = retrieve_url (url_parsed, *t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
++           {
++             struct iri *i = iri_new ();
++             set_uri_encoding (i, opt.locale, true);
++             status = retrieve_url (url_parsed, *t, &filename, &redirected_URL,
++                                    NULL, &dt, opt.recursive, i);
++             iri_free (i);
++           }
+
+          if (opt.delete_after && file_exists_p(filename))
+            {
+              DEBUGP (("Removing file due to --delete-after in main():\n"));
+              logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
+              if (unlink (filename))
+                logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+            }
+
+          xfree_null (redirected_URL);
+          xfree_null (filename);
+          url_free (url_parsed);
+        }
     }
 
   /* And then from the input file, if any. */
diff --cc src/recur.c
index 2e067505,95581486..83a9b4ee
--- a/src/recur.c
+++ b/src/recur.c
@@@ -153,9 -160,9 +160,9 @@@ url_dequeue (struct url_queue *queue, s
 }
 
 static bool download_child_p (const struct urlpos *, struct url *, int,
-                               struct url *, struct hash_table *);
+                               struct url *, struct hash_table *, struct iri *);
 -static bool descend_redirect_p (const char *, const char *, int,
 +static bool descend_redirect_p (const char *, struct url *, int,
-                                  struct url *, struct hash_table *);
+                                  struct url *, struct hash_table *, struct iri *);
 
 
 /* Retrieve a part of the web beginning with START_URL.  This used to
@@@ -180,7 -187,7 +187,7 @@@
    options, add it to the queue. */
 uerr_t
- retrieve_tree (struct url *start_url_parsed)
 -retrieve_tree (const char *start_url, struct iri *pi)
++retrieve_tree (struct url *start_url_parsed, struct iri *pi)
 {
   uerr_t status = RETROK;
 
@@@ -191,6 -198,31 +198,21 @@@
      the queue, but haven't been downloaded yet. */
   struct hash_table *blacklist;
 
+  int up_error_code;
-  struct url *start_url_parsed;
+  struct iri *i = iri_new ();
+
+ #define COPYSTR(x)  (x) ? xstrdup(x) : NULL;
+  /* Duplicate pi struct if not NULL */
+  if (pi)
+    {
+      i->uri_encoding = COPYSTR (pi->uri_encoding);
+      i->content_encoding = COPYSTR (pi->content_encoding);
+      i->utf8_encode = pi->utf8_encode;
+    }
+  else
+    set_uri_encoding (i, opt.locale, true);
+ #undef COPYSTR
+
-  start_url_parsed = url_parse (start_url, &up_error_code, i, true);
-  if (!start_url_parsed)
-    {
-      char *error = url_error (start_url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url, error);
-      xfree (error);
-      return URLERROR;
-    }
- 
   queue = url_queue_new ();
   blacklist = make_string_hash_table (0);
@@@ -253,22 -286,11 +276,12 @@@
         }
       else
         {
-          int dt = 0;
+          int dt = 0, url_err;
           char *redirected = NULL;
-          struct url *url_parsed = url_parse (url, &url_err);
++         struct url *url_parsed = url_parse (url, &url_err, i, false);
 
-          if (!url_parsed)
-            {
-              char *error = url_error (url, url_err);
-              logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-              xfree (error);
-              status = URLERROR;
-            }
-          else
-            {
-              status = retrieve_url (url_parsed, url, &file, &redirected,
-                                     referer, &dt, false);
-            }
-          status = retrieve_url (url, &file, &redirected, referer, &dt,
-                                 false, i);
++         status = retrieve_url (url_parsed, url, &file, &redirected, referer,
++                                &dt, false, i);
 
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@@ -295,8 -317,8 +308,8 @@@
              want to follow it. */
           if (descend)
             {
-              if (!descend_redirect_p (redirected, url, depth,
+              if (!descend_redirect_p (redirected, url_parsed, depth,
-                                       start_url_parsed, blacklist))
+                                       start_url_parsed, blacklist, i))
                 descend = false;
               else
                 /* Make sure that the old pre-redirect form gets
@@@ -656,24 -686,27 +676,25 @@@ download_child_p (const struct urlpos *
    it is merely a simple-minded wrapper around download_child_p.
    */
 static bool
 -descend_redirect_p (const char *redirected, const char *original, int depth,
 +descend_redirect_p (const char *redirected, struct url *orig_parsed, int depth,
-                     struct url *start_url_parsed, struct hash_table *blacklist)
+                     struct url *start_url_parsed, struct hash_table *blacklist,
+                     struct iri *iri)
 {
-  struct url *orig_parsed, *new_parsed;
+  struct url *new_parsed;
   struct urlpos *upos;
   bool success;
 
-  orig_parsed = url_parse (original, NULL, NULL, false);
   assert (orig_parsed != NULL);
 
-  new_parsed = url_parse (redirected, NULL);
+  new_parsed = url_parse (redirected, NULL, NULL, false);
   assert (new_parsed != NULL);
 
   upos = xnew0 (struct urlpos);
   upos->url = new_parsed;
 
   success = download_child_p (upos, orig_parsed, depth,
-                              start_url_parsed, blacklist);
+                              start_url_parsed, blacklist, iri);
 
-  url_free (orig_parsed);
   url_free (new_parsed);
   xfree (upos);
diff --cc src/recur.h
index 7eeb5642,515a382b..76c0ef5f
--- a/src/recur.h
+++ b/src/recur.h
@@@ -44,6 -42,6 +44,6 @@@ as that of the covered work.  *
 struct urlpos;
 
 void recursive_cleanup (void);
- uerr_t retrieve_tree (struct url *);
 -uerr_t retrieve_tree (const char *, struct iri *);
++uerr_t retrieve_tree (struct url *, struct iri *);
 
 #endif /* RECUR_H */
diff --cc src/res.c
index 20ffe1c8,0320d034..4b0ff82b
--- a/src/res.c
+++ b/src/res.c
@@@ -537,32 -538,22 +538,38 @@@ res_retrieve_file (const char *url, cha
   uerr_t err;
   char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
   int saved_ts_val = opt.timestamping;
-  int saved_sp_val = opt.spider;
+  int saved_sp_val = opt.spider, url_err;
+  struct url * url_parsed;
 
+  /* Copy server URI encoding for a possible IDNA transformation, no need to
+     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
+  set_uri_encoding (i, iri->uri_encoding, false);
+  i->utf8_encode = false;
+
   logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
   *file = NULL;
   opt.timestamping = false;
   opt.spider = false;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false, i);
+
-  url_parsed = url_parse (robots_url, &url_err);
++ url_parsed = url_parse (robots_url, &url_err, iri, true);
+  if (!url_parsed)
+    {
+      char *error = url_error (robots_url, url_err);
+      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
+      xfree (error);
+      err = URLERROR;
+    }
+  else
+    {
+      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
-                          false);
++                         false, i);
+      url_free(url_parsed);
+    }
+
   opt.timestamping = saved_ts_val;
-  opt.spider = saved_sp_val;
+  opt.spider = saved_sp_val;
   xfree (robots_url);
+  iri_free (i);
 
   if (err != RETROK && *file != NULL)
     {
diff --cc src/retr.c
index ffa84c38,1d9d7478..0fd936d0
--- a/src/retr.c
+++ b/src/retr.c
@@@ -597,8 -596,8 +597,9 @@@ static char *getproxy (struct url *)
    multiple points.
    */
 uerr_t
 -retrieve_url (const char *origurl, char **file, char **newloc,
 -              const char *refurl, int *dt, bool recursive, struct iri *iri)
 +retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
-               char **newloc, const char *refurl, int *dt, bool recursive)
++              char **newloc, const char *refurl, int *dt, bool recursive,
++              struct iri *iri)
 {
   uerr_t result;
   char *url;
@@@ -626,6 -625,21 +627,11 @@@
   if (file)
     *file = NULL;
 
+ second_try:
-  u = url_parse (url, &up_error_code, iri, true);
-  if (!u)
-    {
-      char *error = url_error (url, up_error_code);
-      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
-      xfree (url);
-      xfree (error);
-      return URLERROR;
-    }
- 
+  DEBUGP (("[IRI Retrieving %s with %s (UTF-8=%d)\n", quote_n (0, url),
+           iri->uri_encoding ? quote_n (1, iri->uri_encoding) : "None",
+           iri->utf8_encode));
+
   if (!refurl)
     refurl = opt.referer;
@@@ -836,25 -862,20 +866,30 @@@ retrieve_from_file (const char *file, b
   status = RETROK;             /* Suppose everything is OK.  */
   *count = 0;                  /* Reset the URL count.  */
- 
+
+  /* sXXXav : Assume filename and links in the file are in the locale */
+  set_uri_encoding (iri, opt.locale, true);
+  set_content_encoding (iri, opt.locale);
+
   if (url_has_scheme (url))
     {
-      int dt;
+      int dt,url_err;
       uerr_t status;
-      struct url * url_parsed = url_parse(url, &url_err);
++     struct url * url_parsed = url_parse(url, &url_err, NULL, true);
+
+      if (!url_parsed)
+        {
+          char *error = url_error (url, url_err);
+          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+          xfree (error);
+          return URLERROR;
+        }
 
       if (!opt.base_href)
         opt.base_href = xstrdup (url);
 
-      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, false);
-      status = retrieve_url (url, &input_file, NULL, NULL, &dt, false, iri);
++     status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
++                            false, iri);
       if (status != RETROK)
         return status;
@@@ -886,18 -917,16 +931,16 @@@
           int old_follow_ftp = opt.follow_ftp;
 
           /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
-          if (cur_url->url->scheme == SCHEME_FTP)
+          if (cur_url->url->scheme == SCHEME_FTP)
             opt.follow_ftp = 1;
- 
-          status = retrieve_tree (cur_url->url);
+
-          status = retrieve_tree (cur_url->url->url, iri);
++         status = retrieve_tree (cur_url->url, iri);
 
           opt.follow_ftp = old_follow_ftp;
         }
       else
-        {
-          status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
-                                 &new_file, NULL, &dt, opt.recursive);
-        }
-        status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL,
-                               &dt, opt.recursive, iri);
++       status = retrieve_url (cur_url->url, cur_url->url->url, &filename,
++                              &new_file, NULL, &dt, opt.recursive, iri);
 
       if (filename && opt.delete_after && file_exists_p (filename))
         {
diff --cc src/retr.h
index 72be93b7,bb2e66d3..8854b684
--- a/src/retr.h
+++ b/src/retr.h
@@@ -53,7 -51,8 +53,8 @@@ typedef const char *(*hunk_terminator_t
 char *fd_read_hunk (int, hunk_terminator_t, long, long);
 char *fd_read_line (int);
 
- uerr_t retrieve_url (struct url *, const char *, char **, char **, const char *, int *, bool);
 -uerr_t retrieve_url (const char *, char **, char **, const char *, int *,
 -                     bool, struct iri *);
++uerr_t retrieve_url (struct url *, const char *, char **, char **,
++                     const char *, int *, bool, struct iri *);
 uerr_t retrieve_from_file (const char *, bool, int *);
 
 const char *retr_rate (wgint, double);
diff --cc src/url.c
index d416fcf7,86d099a7..4c22a9fc
--- a/src/url.c
+++ b/src/url.c
@@@ -668,7 -668,7 +668,8 @@@ url_parse (const char *url, int *error
   int port;
   char *user = NULL, *passwd = NULL;
 
-  char *url_encoded = NULL;
-  char *url_encoded = NULL, *new_url = NULL;
++ const char *url_encoded = NULL;
++ char *new_url = NULL;
 
   int error_code;
@@@ -875,7 -904,7 +905,7 @@@
       if (url_encoded == url)
         u->url = xstrdup (url);
       else
--        u->url = url_encoded;
++        u->url = (char *) url_encoded;
     }
 
   return u;
@@@ -883,7 -912,7 +913,7 @@@
  error:
   /* Cleanup in case of error: */
   if (url_encoded && url_encoded != url)
--    xfree (url_encoded);
++    xfree ((char *) url_encoded);
 
   /* Transmit the error code to the caller, if the caller wants to
      know.  */
diff --cc tests/ChangeLog
index 522bd202,d9ba6531..3dfc60a3
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@@ -1,27 -1,19 +1,43 @@@
+ 2008-12-04  Micah Cowan  (not copyrightable)
+ 
+ 	* run-px, Test-idn-robots.px: Added test for robots-file
+ 	downloads.
+ 
+ 	* Test-idn-cmd.px, Test-idn-meta.px, Test-idn-headers.px:
+ 	Fix test names.
+ 
+ 2008-11-26  Micah Cowan  (not copyrightable)
+ 
+ 	* Test-ftp-iri-disabled.px, Test-ftp-iri-fallback.px,
+ 	Test-ftp-iri.px, Test-idn-cmd.px, Test-idn-headers.px,
+ 	Test-idn-meta.px, Test-iri-disabled.px,
+ 	Test-iri-forced-remote.px, Test-iri-list.px, Test-iri.px: More
+ 	module-scope warnings.
+ 
 +2009-06-14  Micah Cowan
 +
 +	* Makefile.am (EXTRA_DIST): Include all the tests, run-px, and
 +	certs/, to make distcheck happy.
 +
 +2009-06-11  Benjamin Wolsey
 +
 +	* Test-proxied-https-auth.px: Take an optional argument for the
 +	top source directory, so we can find the cert and key.
 +
 +	* run-px: Provide the top source directory as an argument, so
 +	scripts can find their way around.
 +
 +2009-04-11  Steven Schubiger
 +
 +	* run-px: Skip testing with real rc files by setting
 +	SYSTEM_WGETRC and WGETRC to /dev/null.
 +
 +2009-02-25  Benjamin Wolsey
 +
 +	* Makefile.am (run-px-tests): Ensure run-px is run from srcdir.
 +
 +	* run-px: Include modules from srcdir.
 +
  2008-11-25  Steven Schubiger
 
  	* WgetTest.pm.in: Remove the magic interpreter line;
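
The recurring pattern across the hunks above is the new calling convention: retrieve_url and
retrieve_tree no longer accept a raw URL string and parse it internally; each caller runs
url_parse itself, reports failures through url_error, and passes the resulting struct url *
together with a struct iri * describing the URI and content encodings. A minimal sketch of
that convention, assembled from the hunks above; example_fetch is a hypothetical wrapper,
not a function in this commit, and it assumes wget's internal headers (url.h, retr.h, iri.h,
log.h, utils.h, wget.h) are in scope:

/* Sketch only: mirrors the post-merge convention seen in src/main.c above.
   "example_fetch" is hypothetical; the other identifiers are the ones the
   hunks themselves use.  */
static uerr_t
example_fetch (const char *url)
{
  int dt = 0, url_err;
  char *file = NULL, *redirected = NULL;
  uerr_t status;

  /* Callers parse the URL themselves now, and report errors via url_error.  */
  struct url *url_parsed = url_parse (url, &url_err, NULL, false);
  if (!url_parsed)
    {
      char *error = url_error (url, url_err);
      logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
      xfree (error);
      return URLERROR;
    }

  /* Callers also supply an iri carrying the locale-derived URI encoding.  */
  struct iri *i = iri_new ();
  set_uri_encoding (i, opt.locale, true);

  /* New 8-argument signature: parsed URL first, original string second.  */
  status = retrieve_url (url_parsed, url, &file, &redirected,
                         NULL, &dt, false, i);

  iri_free (i);
  url_free (url_parsed);
  xfree_null (redirected);
  xfree_null (file);
  return status;
}

Keeping parsing in the callers is what lets retrieve_tree drop its own url_parse/url_error
block (see the src/recur.c hunks) and lets main.c reuse one parsed URL for both the
recursive and the single-download paths.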