/* File retrieval.
- Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+ 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
This file is part of GNU Wget.
shall include the source code for the parts of OpenSSL used as well
as that of the covered work. */
-#include <config.h>
+#include "wget.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
-#include "wget.h"
+#include "exits.h"
#include "utils.h"
#include "retr.h"
#include "progress.h"
#include "hash.h"
#include "convert.h"
#include "ptimer.h"
+#include "html-url.h"
+#include "iri.h"
/* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes;
performance: fast downloads will arrive in large 16K chunks
(which stdio would write out immediately anyway), and slow
downloads wouldn't be limited by disk speed. */
+
+ /* 2005-04-20 SMS.
+ Perhaps it shouldn't hinder performance, but it sure does, at least
+ on VMS (more than 2X). Rather than speculate on what it should or
+ shouldn't do, it might make more sense to test it. Even better, it
+ might be nice to explain what possible benefit it could offer, as
+ it appears to be a clear invitation to poor performance with no
+ actual justification. (Also, why 16K? Anyone test other values?)
+ */
+#ifndef __VMS
fflush (out);
+#endif /* ndef __VMS */
return !ferror (out);
}
bool progress_interactive = false;
bool exact = !!(flags & rb_read_exactly);
+
+ /* Used only by HTTP/HTTPS chunked transfer encoding. */
+ bool chunked = flags & rb_chunked_transfer_encoding;
wgint skip = 0;
/* How much data we've read/written. */
wgint sum_read = 0;
wgint sum_written = 0;
+ wgint remaining_chunk_size = 0;
if (flags & rb_skip_startpos)
skip = startpos;
/* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
argument to progress_create because the indicator doesn't
(yet) know about "skipping" data. */
- progress = progress_create (skip ? 0 : startpos, startpos + toread);
+ wgint start = skip ? 0 : startpos;
+ progress = progress_create (start, start + toread);
progress_interactive = progress_interactive_p (progress);
}
should be read. */
while (!exact || (sum_read < toread))
{
- int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
+ int rdsize;
double tmout = opt.read_timeout;
+
+ if (chunked)
+ {
+ if (remaining_chunk_size == 0)
+ {
+ char *line = fd_read_line (fd);
+ char *endl;
+ if (line == NULL)
+ {
+ ret = -1;
+ break;
+ }
+
+ remaining_chunk_size = strtol (line, &endl, 16);
+ if (remaining_chunk_size == 0)
+ {
+ ret = 0;
+ if (fd_read_line (fd) == NULL)
+ ret = -1;
+ break;
+ }
+ }
+
+ rdsize = MIN (remaining_chunk_size, dlbufsize);
+ }
+ else
+ rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
+
if (progress_interactive)
{
/* For interactive progress gauges, always specify a ~1s
else if (ret <= 0)
break; /* EOF or read error */
- if (progress || opt.limit_rate)
+ if (progress || opt.limit_rate || elapsed)
{
ptimer_measure (timer);
if (ret > 0)
ret = -2;
goto out;
}
+ if (chunked)
+ {
+ remaining_chunk_size -= ret;
+ if (remaining_chunk_size == 0)
+ if (fd_read_line (fd) == NULL)
+ {
+ ret = -1;
+ break;
+ }
+ }
}
if (opt.limit_rate)
char *hunk = xmalloc (bufsize);
int tail = 0; /* tail position in HUNK */
- assert (maxsize >= bufsize);
+ assert (!maxsize || maxsize >= bufsize);
while (1)
{
multiple points. */
uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+ char **newloc, const char *refurl, int *dt, bool recursive,
+ struct iri *iri, bool register_status)
{
uerr_t result;
char *url;
bool location_changed;
+ bool iri_fallbacked = 0;
int dummy;
char *mynewloc, *proxy;
- struct url *u, *proxy_url;
+ struct url *u = orig_parsed, *proxy_url;
int up_error_code; /* url parse error code */
char *local_file;
int redirection_count = 0;
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code);
- if (!u)
- {
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, url_error (up_error_code));
- xfree (url);
- return URLERROR;
- }
-
if (!refurl)
refurl = opt.referer;
redirected:
+ /* (also for IRI fallbacking) */
result = NOCONERROR;
mynewloc = NULL;
proxy = getproxy (u);
if (proxy)
{
+ struct iri *pi = iri_new ();
+ set_uri_encoding (pi, opt.locale, true);
+ pi->utf8_encode = false;
+
/* Parse the proxy URL. */
- proxy_url = url_parse (proxy, &up_error_code);
+ proxy_url = url_parse (proxy, &up_error_code, NULL, true);
if (!proxy_url)
{
+ char *error = url_error (proxy, up_error_code);
logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
- proxy, url_error (up_error_code));
+ proxy, error);
xfree (url);
+ xfree (error);
RESTORE_POST_DATA;
- return PROXERR;
+ result = PROXERR;
+ goto bail;
}
if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
{
url_free (proxy_url);
xfree (url);
RESTORE_POST_DATA;
- return PROXERR;
+ result = PROXERR;
+ goto bail;
}
}
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
- result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+ result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
if (redirection_count)
oldrec = glob = false;
- result = ftp_loop (u, dt, proxy_url, recursive, glob);
+ result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
recursive = oldrec;
/* There is a possibility of having HTTP being redirected to
xfree (mynewloc);
mynewloc = construced_newloc;
+ /* Reset UTF-8 encoding state, keep the URI encoding and reset
+ the content encoding. */
+ iri->utf8_encode = opt.enable_iri;
+ set_content_encoding (iri, NULL);
+ xfree_null (iri->orig_url);
+
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
if (!newloc_parsed)
{
+ char *error = url_error (mynewloc, up_error_code);
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
- url_error (up_error_code));
- url_free (u);
+ error);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
+ xfree (error);
RESTORE_POST_DATA;
- return result;
+ goto bail;
}
/* Now mynewloc will become newloc_parsed->url, because if the
logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
opt.max_redirect);
url_free (newloc_parsed);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
RESTORE_POST_DATA;
- return WRONGCODE;
+ result = WRONGCODE;
+ goto bail;
}
xfree (url);
url = mynewloc;
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
u = newloc_parsed;
/* If we're being redirected from POST, we don't want to POST
goto redirected;
}
- if (local_file)
+ /* Try to not encode in UTF-8 if fetching failed */
+ if (!(*dt & RETROKF) && iri->utf8_encode)
{
+ iri->utf8_encode = false;
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
+ u = url_parse (origurl, NULL, iri, true);
+ if (u)
+ {
+ DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
+ url = xstrdup (u->url);
+ iri_fallbacked = 1;
+ goto redirected;
+ }
+ else
+ DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
+ }
+
+ if (local_file && *dt & RETROKF)
+ {
+ register_download (u->url, local_file);
+ if (redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+ if (*dt & TEXTHTML)
+ register_html (u->url, local_file);
if (*dt & RETROKF)
{
register_download (u->url, local_file);
register_redirection (origurl, u->url);
if (*dt & TEXTHTML)
register_html (u->url, local_file);
+ if (*dt & TEXTCSS)
+ register_css (u->url, local_file);
}
}
else
xfree_null (local_file);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
- if (redirection_count)
+ if (redirection_count || iri_fallbacked)
{
if (newloc)
*newloc = url;
RESTORE_POST_DATA;
+bail:
+ if (register_status)
+ inform_exit_status (result);
return result;
}
{
uerr_t status;
struct urlpos *url_list, *cur_url;
+ struct iri *iri = iri_new();
+
+ char *input_file, *url_file = NULL;
+ const char *url = file;
- url_list = (html ? get_urls_html (file, NULL, NULL)
- : get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_uri_encoding (iri, opt.locale, true);
+ set_content_encoding (iri, opt.locale);
+
+ if (url_valid_scheme (url))
+ {
+ int dt,url_err;
+ uerr_t status;
+ struct url * url_parsed = url_parse(url, &url_err, iri, true);
+
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ return URLERROR;
+ }
+
+ if (!opt.base_href)
+ opt.base_href = xstrdup (url);
+
+ status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
+ false, iri, true);
+ url_free (url_parsed);
+
+ if (!url_file || (status != RETROK))
+ return status;
+
+ if (dt & TEXTHTML)
+ html = true;
+
+ /* If we have found a content encoding, use it.
+ * ( == is okay, because we're checking for identical object) */
+ if (iri->content_encoding != opt.locale)
+ set_uri_encoding (iri, iri->content_encoding, false);
+
+ /* Reset UTF-8 encode status */
+ iri->utf8_encode = opt.enable_iri;
+ xfree_null (iri->orig_url);
+ iri->orig_url = NULL;
+
+ input_file = url_file;
+ }
+ else
+ input_file = (char *) file;
+
+ url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
+ : get_urls_file (input_file));
+
+ xfree_null (url_file);
+
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
{
char *filename = NULL, *new_file = NULL;
int dt;
+ struct iri *tmpiri = iri_dup (iri);
+ struct url *parsed_url = NULL;
if (cur_url->ignore_when_downloading)
continue;
status = QUOTEXC;
break;
}
+
+ /* Need to reparse the url, since it didn't have iri information. */
+ if (opt.enable_iri)
+ parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
+
if ((opt.recursive || opt.page_requisites)
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
- status = retrieve_tree (cur_url->url->url);
+
+ status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
+ tmpiri);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
+ cur_url->url->url, &filename,
+ &new_file, NULL, &dt, opt.recursive, tmpiri,
+ true);
+
+ if (parsed_url)
+ url_free (parsed_url);
if (filename && opt.delete_after && file_exists_p (filename))
{
xfree_null (new_file);
xfree_null (filename);
+ iri_free (tmpiri);
}
/* Free the linked list of URL-s. */
free_urlpos (url_list);
+ iri_free (iri);
+
return status;
}
/* Returns true if URL would be downloaded through a proxy.

   U is the already-parsed URL to test.  A null U yields false;
   otherwise the result is true exactly when getproxy (U) returns
   non-NULL.  With this change the function no longer parses a URL
   string and no longer frees U itself; it only inspects the object
   it is handed.  */
bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
{
bool ret;
- struct url *u = url_parse (url, NULL);
if (!u)
return false;
ret = getproxy (u) != NULL;
- url_free (u);
return ret;
}
else
return sufmatch (no_proxy, host);
}
+
+/* Set the file parameter to point to the local file string.
+
+   When opt.output_document is set, *FILE is pointed at it only if
+   output_stream_regular is true; otherwise *FILE is left untouched
+   and keeps whatever value the caller had stored there.  When
+   opt.output_document is not set, *FILE is pointed at DEFAULT_FILE.
+
+   Nothing is copied: *FILE ends up aliasing opt.output_document or
+   DEFAULT_FILE.
+
+   NOTE(review): presumably a non-regular output stream has no usable
+   local file name, which is why *FILE is not updated in that case --
+   confirm against the callers.  */
+void
+set_local_file (const char **file, const char *default_file)
+{
+ if (opt.output_document)
+ {
+ if (output_stream_regular)
+ *file = opt.output_document;
+ }
+ else
+ *file = default_file;
+}
+
+/* Return true for an input file's own URL, false otherwise.
+
+   The answer is true at most once: a function-local static flag is
+   cleared on the first call in which INPUT_FILE is non-null and
+   url_has_scheme (INPUT_FILE) holds, and every later call returns
+   false regardless of its argument.  Calls with a null INPUT_FILE, or
+   one for which url_has_scheme fails, return false and leave the flag
+   set, so a later qualifying call can still return true.
+
+   Because of the static flag the result depends on call history, not
+   only on INPUT_FILE.  */
+bool
+input_file_url (const char *input_file)
+{
+ static bool first = true;
+
+ if (input_file
+ && url_has_scheme (input_file)
+ && first)
+ {
+ first = false; /* Consume the one-shot: later calls return false. */
+ return true;
+ }
+ else
+ return false;
+}