X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Fretr.c;h=b667ca2ff3cf6ecb4edca9c1148e856f68b0ecbe;hp=b5a10e2a03bc7cdc05c7182a0a73d3f0e5434e35;hb=d763f8bf6d6e13ce006ffab616cc8a77e747a633;hpb=714ccdcd844314cc3902fa4fd1b48757d9db9296 diff --git a/src/retr.c b/src/retr.c index b5a10e2a..b667ca2f 100644 --- a/src/retr.c +++ b/src/retr.c @@ -51,6 +51,8 @@ as that of the covered work. */ #include "hash.h" #include "convert.h" #include "ptimer.h" +#include "html-url.h" +#include "iri.h" /* Total size of downloaded files. Used to enforce quota. */ SUM_SIZE_INT total_downloaded_bytes; @@ -141,10 +143,8 @@ limit_bandwidth (wgint bytes, struct ptimer *timer) static int write_data (FILE *out, const char *buf, int bufsize, wgint *skip, - wgint *written, int flags) + wgint *written) { - static int cr_pending = 0; /* Found CR in ASCII FTP data. */ - if (!out) return 1; if (*skip > bufsize) @@ -161,72 +161,8 @@ write_data (FILE *out, const char *buf, int bufsize, wgint *skip, return 1; } -/* Note: This code assumes that "\n" is the universal line ending - character, as on UNIX and VMS. If this is not true, then here's - where to change it. -*/ - -#if 1 -# define EOL_STRING "\n" -#else /* 1 */ -# define EOL_STRING "\r\n" -#endif /* 1 [else] */ -#define EOL_STRING_LEN (sizeof( EOL_STRING)- 1) - - if (flags & rb_ftp_ascii) - { - const char *bufend; - - /* ASCII transfer. Put out lines delimited by CRLF. */ - bufend = buf+ bufsize; - while (buf < bufend) - { - /* If CR, put out any pending CR, then set CR-pending flag. */ - if (*buf == '\r') - { - if (cr_pending) - { - fwrite ("\r", 1, 1, out); - *written += 1; - } - cr_pending = 1; - buf++; - continue; - } - - if (cr_pending) - { - if (*buf == '\n') - { - /* Found FTP EOL (CRLF). Put out local EOL. */ - fwrite (EOL_STRING, 1, EOL_STRING_LEN, out); - *written += EOL_STRING_LEN; - } - else - { - /* Normal character. Put out pending CR and it. */ - fwrite ("\r", 1, 1, out); - fwrite (buf, 1, 1, out); - *written += 2; - } - buf++; - cr_pending = 0; - } - else - { - /* Normal character. Put it out. */ - fwrite (buf, 1, 1, out); - *written += 1; - buf++; - } - } - } - else - { - /* Image transfer. Put out buffer. */ - fwrite (buf, 1, bufsize, out); - *written += bufsize; - } + fwrite (buf, 1, bufsize, out); + *written += bufsize; /* Immediately flush the downloaded data. This should not hinder performance: fast downloads will arrive in large 16K chunks @@ -302,7 +238,8 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL argument to progress_create because the indicator doesn't (yet) know about "skipping" data. */ - progress = progress_create (skip ? 0 : startpos, startpos + toread); + wgint start = skip ? 0 : startpos; + progress = progress_create (start, start + toread); progress_interactive = progress_interactive_p (progress); } @@ -373,7 +310,7 @@ fd_read_body (int fd, FILE *out, wgint toread, wgint startpos, if (ret > 0) { sum_read += ret; - if (!write_data (out, dlbuf, ret, &skip, &sum_written, flags)) + if (!write_data (out, dlbuf, ret, &skip, &sum_written)) { ret = -2; goto out; @@ -469,7 +406,7 @@ fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize) char *hunk = xmalloc (bufsize); int tail = 0; /* tail position in HUNK */ - assert (maxsize >= bufsize); + assert (!maxsize || maxsize >= bufsize); while (1) { @@ -672,15 +609,17 @@ static char *getproxy (struct url *); multiple points. */ uerr_t -retrieve_url (const char *origurl, char **file, char **newloc, - const char *refurl, int *dt, bool recursive) +retrieve_url (struct url * orig_parsed, const char *origurl, char **file, + char **newloc, const char *refurl, int *dt, bool recursive, + struct iri *iri) { uerr_t result; char *url; bool location_changed; + bool iri_fallbacked = 0; int dummy; char *mynewloc, *proxy; - struct url *u, *proxy_url; + struct url *u = orig_parsed, *proxy_url; int up_error_code; /* url parse error code */ char *local_file; int redirection_count = 0; @@ -701,18 +640,11 @@ retrieve_url (const char *origurl, char **file, char **newloc, if (file) *file = NULL; - u = url_parse (url, &up_error_code); - if (!u) - { - logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code)); - xfree (url); - return URLERROR; - } - if (!refurl) refurl = opt.referer; redirected: + /* (also for IRI fallbacking) */ result = NOCONERROR; mynewloc = NULL; @@ -722,13 +654,19 @@ retrieve_url (const char *origurl, char **file, char **newloc, proxy = getproxy (u); if (proxy) { + struct iri *pi = iri_new (); + set_uri_encoding (pi, opt.locale, true); + pi->utf8_encode = false; + /* Parse the proxy URL. */ - proxy_url = url_parse (proxy, &up_error_code); + proxy_url = url_parse (proxy, &up_error_code, NULL, true); if (!proxy_url) { + char *error = url_error (proxy, up_error_code); logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"), - proxy, url_error (up_error_code)); + proxy, error); xfree (url); + xfree (error); RESTORE_POST_DATA; return PROXERR; } @@ -748,7 +686,7 @@ retrieve_url (const char *origurl, char **file, char **newloc, #endif || (proxy_url && proxy_url->scheme == SCHEME_HTTP)) { - result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url); + result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri); } else if (u->scheme == SCHEME_FTP) { @@ -798,15 +736,26 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (mynewloc); mynewloc = construced_newloc; + /* Reset UTF-8 encoding state, keep the URI encoding and reset + the content encoding. */ + iri->utf8_encode = opt.enable_iri; + set_content_encoding (iri, NULL); + xfree_null (iri->orig_url); + /* Now, see if this new location makes sense. */ - newloc_parsed = url_parse (mynewloc, &up_error_code); + newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true); if (!newloc_parsed) { + char *error = url_error (mynewloc, up_error_code); logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc), - url_error (up_error_code)); - url_free (u); + error); + if (orig_parsed != u) + { + url_free (u); + } xfree (url); xfree (mynewloc); + xfree (error); RESTORE_POST_DATA; return result; } @@ -823,7 +772,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"), opt.max_redirect); url_free (newloc_parsed); - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } xfree (url); xfree (mynewloc); RESTORE_POST_DATA; @@ -832,7 +784,10 @@ retrieve_url (const char *origurl, char **file, char **newloc, xfree (url); url = mynewloc; - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } u = newloc_parsed; /* If we're being redirected from POST, we don't want to POST @@ -846,8 +801,33 @@ retrieve_url (const char *origurl, char **file, char **newloc, goto redirected; } - if (local_file) + /* Try to not encode in UTF-8 if fetching failed */ + if (!(*dt & RETROKF) && iri->utf8_encode) + { + iri->utf8_encode = false; + if (orig_parsed != u) + { + url_free (u); + } + u = url_parse (origurl, NULL, iri, true); + if (u) + { + DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url))); + url = xstrdup (u->url); + iri_fallbacked = 1; + goto redirected; + } + else + DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url))); + } + + if (local_file && *dt & RETROKF) { + register_download (u->url, local_file); + if (redirection_count && 0 != strcmp (origurl, u->url)) + register_redirection (origurl, u->url); + if (*dt & TEXTHTML) + register_html (u->url, local_file); if (*dt & RETROKF) { register_download (u->url, local_file); @@ -855,6 +835,8 @@ retrieve_url (const char *origurl, char **file, char **newloc, register_redirection (origurl, u->url); if (*dt & TEXTHTML) register_html (u->url, local_file); + if (*dt & TEXTCSS) + register_css (u->url, local_file); } } @@ -863,9 +845,12 @@ retrieve_url (const char *origurl, char **file, char **newloc, else xfree_null (local_file); - url_free (u); + if (orig_parsed != u) + { + url_free (u); + } - if (redirection_count) + if (redirection_count || iri_fallbacked) { if (newloc) *newloc = url; @@ -895,16 +880,65 @@ retrieve_from_file (const char *file, bool html, int *count) { uerr_t status; struct urlpos *url_list, *cur_url; + struct iri *iri = iri_new(); + + char *input_file = NULL; + const char *url = file; - url_list = (html ? get_urls_html (file, NULL, NULL) - : get_urls_file (file)); status = RETROK; /* Suppose everything is OK. */ *count = 0; /* Reset the URL count. */ + /* sXXXav : Assume filename and links in the file are in the locale */ + set_uri_encoding (iri, opt.locale, true); + set_content_encoding (iri, opt.locale); + + if (url_has_scheme (url)) + { + int dt,url_err; + uerr_t status; + struct url * url_parsed = url_parse(url, &url_err, iri, true); + + if (!url_parsed) + { + char *error = url_error (url, url_err); + logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error); + xfree (error); + return URLERROR; + } + + if (!opt.base_href) + opt.base_href = xstrdup (url); + + status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt, + false, iri); + if (status != RETROK) + return status; + + if (dt & TEXTHTML) + html = true; + + /* If we have a found a content encoding, use it. + * ( == is okay, because we're checking for identical object) */ + if (iri->content_encoding != opt.locale) + set_uri_encoding (iri, iri->content_encoding, false); + + /* Reset UTF-8 encode status */ + iri->utf8_encode = opt.enable_iri; + xfree_null (iri->orig_url); + iri->orig_url = NULL; + } + else + input_file = (char *) file; + + url_list = (html ? get_urls_html (input_file, NULL, NULL, iri) + : get_urls_file (input_file)); + for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count) { char *filename = NULL, *new_file = NULL; int dt; + struct iri *tmpiri = iri_dup (iri); + struct url *parsed_url = NULL; if (cur_url->ignore_when_downloading) continue; @@ -914,21 +948,32 @@ retrieve_from_file (const char *file, bool html, int *count) status = QUOTEXC; break; } + + /* Need to reparse the url, since it didn't have iri information. */ + if (opt.enable_iri) + parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true); + if ((opt.recursive || opt.page_requisites) && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url))) { int old_follow_ftp = opt.follow_ftp; /* Turn opt.follow_ftp on in case of recursive FTP retrieval */ - if (cur_url->url->scheme == SCHEME_FTP) + if (cur_url->url->scheme == SCHEME_FTP) opt.follow_ftp = 1; - - status = retrieve_tree (cur_url->url->url); + + status = retrieve_tree (parsed_url ? parsed_url : cur_url->url, + tmpiri); opt.follow_ftp = old_follow_ftp; } else - status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive); + status = retrieve_url (parsed_url ? parsed_url : cur_url->url, + cur_url->url->url, &filename, + &new_file, NULL, &dt, opt.recursive, tmpiri); + + if (parsed_url) + url_free (parsed_url); if (filename && opt.delete_after && file_exists_p (filename)) { @@ -942,11 +987,14 @@ Removing file due to --delete-after in retrieve_from_file():\n")); xfree_null (new_file); xfree_null (filename); + iri_free (tmpiri); } /* Free the linked list of URL-s. */ free_urlpos (url_list); + iri_free (iri); + return status; } @@ -1096,14 +1144,12 @@ getproxy (struct url *u) /* Returns true if URL would be downloaded through a proxy. */ bool -url_uses_proxy (const char *url) +url_uses_proxy (struct url * u) { bool ret; - struct url *u = url_parse (url, NULL); if (!u) return false; ret = getproxy (u) != NULL; - url_free (u); return ret; } @@ -1116,3 +1162,16 @@ no_proxy_match (const char *host, const char **no_proxy) else return sufmatch (no_proxy, host); } + +/* Set the file parameter to point to the local file string. */ +void +set_local_file (const char **file, const char *default_file) +{ + if (opt.output_document) + { + if (output_stream_regular) + *file = opt.output_document; + } + else + *file = default_file; +}