/* File retrieval.
- Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
- 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+ Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
+ 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation,
+ Inc.
This file is part of GNU Wget.
#include <stdio.h>
#include <stdlib.h>
-#ifdef HAVE_UNISTD_H
-# include <unistd.h>
-#endif /* HAVE_UNISTD_H */
+#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
+#ifdef VMS
+# include <unixio.h> /* For delete(). */
+#endif
+#include "exits.h"
#include "utils.h"
#include "retr.h"
#include "progress.h"
#include "convert.h"
#include "ptimer.h"
#include "html-url.h"
+#include "iri.h"
/* Total size of downloaded files. Used to enforce quota. */
SUM_SIZE_INT total_downloaded_bytes;
/* Write data in BUF to OUT. However, if *SKIP is non-zero, skip that
amount of data and decrease SKIP. Increment *TOTAL by the amount
- of data written. */
+ of data written. If OUT2 is not NULL, also write BUF to OUT2.
+ In case of error writing to OUT, -1 is returned. In case of error
+ writing to OUT2, -2 is returned. Return 1 if the whole BUF was
+ skipped. */
static int
-write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
- wgint *written)
+write_data (FILE *out, FILE *out2, const char *buf, int bufsize,
+ wgint *skip, wgint *written)
{
- if (!out)
+ if (out == NULL && out2 == NULL)
return 1;
if (*skip > bufsize)
{
return 1;
}
- fwrite (buf, 1, bufsize, out);
+ if (out != NULL)
+ fwrite (buf, 1, bufsize, out);
+ if (out2 != NULL)
+ fwrite (buf, 1, bufsize, out2);
*written += bufsize;
/* Immediately flush the downloaded data. This should not hinder
performance: fast downloads will arrive in large 16K chunks
(which stdio would write out immediately anyway), and slow
downloads wouldn't be limited by disk speed. */
- fflush (out);
- return !ferror (out);
+
+ /* 2005-04-20 SMS.
+ Perhaps it shouldn't hinder performance, but it sure does, at least
+ on VMS (more than 2X). Rather than speculate on what it should or
+ shouldn't do, it might make more sense to test it. Even better, it
+ might be nice to explain what possible benefit it could offer, as
+ it appears to be a clear invitation to poor performance with no
+ actual justification. (Also, why 16K? Anyone test other values?)
+ */
+#ifndef __VMS
+ if (out != NULL)
+ fflush (out);
+ if (out2 != NULL)
+ fflush (out2);
+#endif /* ndef __VMS */
+ if (out != NULL && ferror (out))
+ return -1;
+ else if (out2 != NULL && ferror (out2))
+ return -2;
+ else
+ return 0;
}
/* Read the contents of file descriptor FD until it the connection
the amount of data written to disk. The time it took to download
the data is stored to ELAPSED.
+ If OUT2 is non-NULL, the contents are also written to OUT2.
+ OUT2 will get an exact copy of the response: if this is a chunked
+ response, everything -- including the chunk headers -- is written
+ to OUT2. (OUT will only get the unchunked response.)
+
The function exits and returns the amount of data read. In case of
error while reading data, -1 is returned. In case of error while
- writing data, -2 is returned. */
+ writing data to OUT, -2 is returned. In case of error while writing
+ data to OUT2, -3 is returned. */
int
-fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
- wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
+fd_read_body (const char *downloaded_filename, int fd, FILE *out, wgint toread, wgint startpos,
+
+ wgint *qtyread, wgint *qtywritten, double *elapsed, int flags,
+ FILE *out2)
{
int ret = 0;
-
- static char dlbuf[16384];
- int dlbufsize = sizeof (dlbuf);
+#undef max
+#define max(a,b) ((a) > (b) ? (a) : (b))
+ int dlbufsize = max (BUFSIZ, 8 * 1024);
+ char *dlbuf = xmalloc (dlbufsize);
struct ptimer *timer = NULL;
double last_successful_read_tm = 0;
bool progress_interactive = false;
bool exact = !!(flags & rb_read_exactly);
+
+ /* Used only by HTTP/HTTPS chunked transfer encoding. */
+ bool chunked = flags & rb_chunked_transfer_encoding;
wgint skip = 0;
/* How much data we've read/written. */
wgint sum_read = 0;
wgint sum_written = 0;
+ wgint remaining_chunk_size = 0;
if (flags & rb_skip_startpos)
skip = startpos;
- if (opt.verbose)
+ if (opt.show_progress)
{
/* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
argument to progress_create because the indicator doesn't
(yet) know about "skipping" data. */
- progress = progress_create (skip ? 0 : startpos, startpos + toread);
+ wgint start = skip ? 0 : startpos;
+ progress = progress_create (downloaded_filename, start, start + toread);
progress_interactive = progress_interactive_p (progress);
}
should be read. */
while (!exact || (sum_read < toread))
{
- int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
+ int rdsize;
double tmout = opt.read_timeout;
+
+ /* Chunked transfer encoding: once the current chunk is used up,
+ read the next chunk-size line and cap this read at that chunk.
+ Chunk-size lines are copied verbatim to OUT2 when it is set. */
+ if (chunked)
+ {
+ if (remaining_chunk_size == 0)
+ {
+ char *line = fd_read_line (fd);
+ char *endl;
+ if (line == NULL)
+ {
+ ret = -1;
+ break;
+ }
+ else if (out2 != NULL)
+ fwrite (line, 1, strlen (line), out2);
+
+ remaining_chunk_size = strtol (line, &endl, 16);
+ xfree (line);
+
+ /* strtol accepts a leading '-', so a broken or hostile
+ server can announce a negative chunk size. Report that
+ as a read error rather than computing a negative RDSIZE
+ from it below. */
+ if (remaining_chunk_size < 0)
+ {
+ ret = -1;
+ break;
+ }
+
+ /* A zero-sized chunk ends the body: consume the line that
+ follows it (copying it to OUT2 too) and stop reading. */
+ if (remaining_chunk_size == 0)
+ {
+ ret = 0;
+ line = fd_read_line (fd);
+ if (line == NULL)
+ ret = -1;
+ else
+ {
+ if (out2 != NULL)
+ fwrite (line, 1, strlen (line), out2);
+ xfree (line);
+ }
+ break;
+ }
+ }
+
+ rdsize = MIN (remaining_chunk_size, dlbufsize);
+ }
+ else
+ rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
+
if (progress_interactive)
{
/* For interactive progress gauges, always specify a ~1s
else if (ret <= 0)
break; /* EOF or read error */
- if (progress || opt.limit_rate)
+ if (progress || opt.limit_rate || elapsed)
{
ptimer_measure (timer);
if (ret > 0)
if (ret > 0)
{
sum_read += ret;
- if (!write_data (out, dlbuf, ret, &skip, &sum_written))
+ int write_res = write_data (out, out2, dlbuf, ret, &skip, &sum_written);
+ if (write_res < 0)
{
- ret = -2;
+ /* write_data returns -1 for an error on OUT and -2 for an
+ error on OUT2; this function reports those to its caller
+ as -2 and -3 respectively. */
+ ret = (write_res == -2) ? -3 : -2;
goto out;
}
+ /* In chunked mode, account for the bytes just consumed; when the
+ chunk is complete, read the line that terminates it (copied to
+ OUT2 if set) before the next chunk-size line is parsed. */
+ if (chunked)
+ {
+ remaining_chunk_size -= ret;
+ if (remaining_chunk_size == 0)
+ {
+ char *line = fd_read_line (fd);
+ if (line == NULL)
+ {
+ ret = -1;
+ break;
+ }
+ else
+ {
+ if (out2 != NULL)
+ fwrite (line, 1, strlen (line), out2);
+ xfree (line);
+ }
+ }
+ }
}
if (opt.limit_rate)
if (progress)
progress_update (progress, ret, ptimer_read (timer));
#ifdef WINDOWS
- if (toread > 0 && !opt.quiet)
+ if (toread > 0 && opt.show_progress)
ws_percenttitle (100.0 *
(startpos + sum_read) / (startpos + toread));
#endif
if (qtywritten)
*qtywritten += sum_written;
+ free (dlbuf);
+
return ret;
}
\f
}
static const char *
-line_terminator (const char *start, const char *peeked, int peeklen)
+line_terminator (const char *start _GL_UNUSED, const char *peeked, int peeklen)
{
const char *p = memchr (peeked, '\n', peeklen);
if (p)
{
static char res[20];
static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
+ static const char *rate_names_bits[] = {"b/s", "Kb/s", "Mb/s", "Gb/s" };
int units;
double dlrate = calc_rate (bytes, secs, &units);
e.g. "1022", "247", "12.5", "2.38". */
sprintf (res, "%.*f %s",
dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
- dlrate, rate_names[units]);
+ dlrate, !opt.report_bps ? rate_names[units]: rate_names_bits[units]);
return res;
}
calc_rate (wgint bytes, double secs, int *units)
{
double dlrate;
+ double bibyte = 1000.0;
+
+ if (!opt.report_bps)
+ bibyte = 1024.0;
+
assert (secs >= 0);
assert (bytes >= 0);
0 and the timer's resolution, assume half the resolution. */
secs = ptimer_resolution () / 2.0;
- dlrate = bytes / secs;
- if (dlrate < 1024.0)
+ dlrate = convert_to_bits (bytes) / secs;
+ if (dlrate < bibyte)
*units = 0;
- else if (dlrate < 1024.0 * 1024.0)
- *units = 1, dlrate /= 1024.0;
- else if (dlrate < 1024.0 * 1024.0 * 1024.0)
- *units = 2, dlrate /= (1024.0 * 1024.0);
+ else if (dlrate < (bibyte * bibyte))
+ *units = 1, dlrate /= bibyte;
+ else if (dlrate < (bibyte * bibyte * bibyte))
+ *units = 2, dlrate /= (bibyte * bibyte);
+
else
/* Maybe someone will need this, one day. */
- *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
+ *units = 3, dlrate /= (bibyte * bibyte * bibyte);
return dlrate;
}
\f
-#define SUSPEND_POST_DATA do { \
- post_data_suspended = true; \
- saved_post_data = opt.post_data; \
- saved_post_file_name = opt.post_file_name; \
- opt.post_data = NULL; \
- opt.post_file_name = NULL; \
+#define SUSPEND_METHOD do { \
+ method_suspended = true; \
+ saved_body_data = opt.body_data; \
+ saved_body_file_name = opt.body_file; \
+ saved_method = opt.method; \
+ opt.body_data = NULL; \
+ opt.body_file = NULL; \
+ opt.method = NULL; \
} while (0)
-#define RESTORE_POST_DATA do { \
- if (post_data_suspended) \
+#define RESTORE_METHOD do { \
+ if (method_suspended) \
{ \
- opt.post_data = saved_post_data; \
- opt.post_file_name = saved_post_file_name; \
- post_data_suspended = false; \
+ opt.body_data = saved_body_data; \
+ opt.body_file = saved_body_file_name; \
+ opt.method = saved_method; \
+ method_suspended = false; \
} \
} while (0)
multiple points. */
uerr_t
-retrieve_url (const char *origurl, char **file, char **newloc,
- const char *refurl, int *dt, bool recursive)
+retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
+ char **newloc, const char *refurl, int *dt, bool recursive,
+ struct iri *iri, bool register_status)
{
uerr_t result;
char *url;
bool location_changed;
+ bool iri_fallbacked = 0;
int dummy;
char *mynewloc, *proxy;
- struct url *u, *proxy_url;
+ struct url *u = orig_parsed, *proxy_url;
int up_error_code; /* url parse error code */
char *local_file;
int redirection_count = 0;
- bool post_data_suspended = false;
- char *saved_post_data = NULL;
- char *saved_post_file_name = NULL;
+ bool method_suspended = false;
+ char *saved_body_data = NULL;
+ char *saved_method = NULL;
+ char *saved_body_file_name = NULL;
/* If dt is NULL, use local storage. */
if (!dt)
if (file)
*file = NULL;
- u = url_parse (url, &up_error_code);
- if (!u)
- {
- char *error = url_error (url, up_error_code);
- logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
- xfree (url);
- xfree (error);
- return URLERROR;
- }
-
if (!refurl)
refurl = opt.referer;
redirected:
+ /* (also the re-entry point when retrying after the IRI non-UTF-8 fallback) */
result = NOCONERROR;
mynewloc = NULL;
proxy = getproxy (u);
if (proxy)
{
+ /* PI lives only for the duration of this block and must be
+ released with iri_free () on every path that leaves it.
+ NOTE(review): PI is configured here but url_parse below is
+ still passed NULL -- confirm whether PI was meant to be used. */
+ struct iri *pi = iri_new ();
+ set_uri_encoding (pi, opt.locale, true);
+ pi->utf8_encode = false;
+
/* Parse the proxy URL. */
- proxy_url = url_parse (proxy, &up_error_code);
+ proxy_url = url_parse (proxy, &up_error_code, NULL, true);
if (!proxy_url)
{
char *error = url_error (proxy, up_error_code);
proxy, error);
xfree (url);
xfree (error);
- RESTORE_POST_DATA;
- return PROXERR;
+ iri_free (pi);
+ RESTORE_METHOD;
+ result = PROXERR;
+ goto bail;
}
if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
{
logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
url_free (proxy_url);
xfree (url);
- RESTORE_POST_DATA;
- return PROXERR;
+ iri_free (pi);
+ RESTORE_METHOD;
+ result = PROXERR;
+ goto bail;
}
+ iri_free (pi);
}
#endif
|| (proxy_url && proxy_url->scheme == SCHEME_HTTP))
{
- result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url);
+ result = http_loop (u, orig_parsed, &mynewloc, &local_file, refurl, dt,
+ proxy_url, iri);
}
else if (u->scheme == SCHEME_FTP)
{
if (redirection_count)
oldrec = glob = false;
- result = ftp_loop (u, dt, proxy_url, recursive, glob);
+ result = ftp_loop (u, &local_file, dt, proxy_url, recursive, glob);
recursive = oldrec;
/* There is a possibility of having HTTP being redirected to
proxy_url = NULL;
}
- location_changed = (result == NEWLOCATION);
+ location_changed = (result == NEWLOCATION || result == NEWLOCATION_KEEP_POST);
if (location_changed)
{
char *construced_newloc;
xfree (mynewloc);
mynewloc = construced_newloc;
+ /* Reset UTF-8 encoding state, keep the URI encoding and reset
+ the content encoding. */
+ iri->utf8_encode = opt.enable_iri;
+ set_content_encoding (iri, NULL);
+ xfree_null (iri->orig_url);
+ iri->orig_url = NULL;
+
/* Now, see if this new location makes sense. */
- newloc_parsed = url_parse (mynewloc, &up_error_code);
+ newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
if (!newloc_parsed)
{
char *error = url_error (mynewloc, up_error_code);
logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
error);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
xfree (error);
- RESTORE_POST_DATA;
- return result;
+ RESTORE_METHOD;
+ goto bail;
}
/* Now mynewloc will become newloc_parsed->url, because if the
logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
opt.max_redirect);
url_free (newloc_parsed);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
xfree (url);
xfree (mynewloc);
- RESTORE_POST_DATA;
- return WRONGCODE;
+ RESTORE_METHOD;
+ result = WRONGCODE;
+ goto bail;
}
xfree (url);
url = mynewloc;
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
u = newloc_parsed;
- /* If we're being redirected from POST, we don't want to POST
+ /* If we're being redirected from POST, and we received a
+ redirect code different than 307, we don't want to POST
again. Many requests answer POST with a redirection to an
index page; that redirection is clearly a GET. We "suspend"
POST data for the duration of the redirections, and restore
- it when we're done. */
- if (!post_data_suspended)
- SUSPEND_POST_DATA;
+ it when we're done.
+
+ RFC2616 HTTP/1.1 introduces code 307 Temporary Redirect
+ specifically to preserve the method of the request.
+ */
+ if (result != NEWLOCATION_KEEP_POST && !method_suspended)
+ SUSPEND_METHOD;
goto redirected;
}
- if (local_file)
+ /* Try to not encode in UTF-8 if fetching failed */
+ if (!(*dt & RETROKF) && iri->utf8_encode)
{
- if (*dt & RETROKF)
+ iri->utf8_encode = false;
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
+ u = url_parse (origurl, NULL, iri, true);
+ if (u)
{
- register_download (u->url, local_file);
- if (redirection_count && 0 != strcmp (origurl, u->url))
- register_redirection (origurl, u->url);
- if (*dt & TEXTHTML)
- register_html (u->url, local_file);
- if (*dt & TEXTCSS)
- register_css (u->url, local_file);
+ DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
+ url = xstrdup (u->url);
+ iri_fallbacked = 1;
+ goto redirected;
}
+ else
+ DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
+ }
+
+ if (local_file && u && *dt & RETROKF)
+ {
+ register_download (u->url, local_file);
+
+ if (!opt.spider && redirection_count && 0 != strcmp (origurl, u->url))
+ register_redirection (origurl, u->url);
+
+ if (*dt & TEXTHTML)
+ register_html (local_file);
+
+ if (*dt & TEXTCSS)
+ register_css (local_file);
}
if (file)
else
xfree_null (local_file);
- url_free (u);
+ if (orig_parsed != u)
+ {
+ url_free (u);
+ }
- if (redirection_count)
+ if (redirection_count || iri_fallbacked)
{
if (newloc)
*newloc = url;
xfree (url);
}
- RESTORE_POST_DATA;
+ RESTORE_METHOD;
+bail:
+ if (register_status)
+ inform_exit_status (result);
return result;
}
{
uerr_t status;
struct urlpos *url_list, *cur_url;
+ struct iri *iri = iri_new();
- char *input_file = NULL;
+ char *input_file, *url_file = NULL;
const char *url = file;
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
-
- if (url_has_scheme (url))
+
+ /* sXXXav : Assume filename and links in the file are in the locale */
+ set_uri_encoding (iri, opt.locale, true);
+ set_content_encoding (iri, opt.locale);
+
+ if (url_valid_scheme (url))
{
- int dt;
- uerr_t status;
+ int dt,url_err;
+ struct url *url_parsed = url_parse (url, &url_err, iri, true);
+ if (!url_parsed)
+ {
+ char *error = url_error (url, url_err);
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
+ xfree (error);
+ return URLERROR;
+ }
if (!opt.base_href)
opt.base_href = xstrdup (url);
- status = retrieve_url (url, &input_file, NULL, NULL, &dt, false);
- if (status != RETROK)
+ status = retrieve_url (url_parsed, url, &url_file, NULL, NULL, &dt,
+ false, iri, true);
+ url_free (url_parsed);
+
+ if (!url_file || (status != RETROK))
return status;
if (dt & TEXTHTML)
html = true;
+
+ /* If we have found a content encoding, use it.
+ * ( == is okay, because we're checking for identical object) */
+ if (iri->content_encoding != opt.locale)
+ set_uri_encoding (iri, iri->content_encoding, false);
+
+ /* Reset UTF-8 encode status */
+ iri->utf8_encode = opt.enable_iri;
+ xfree_null (iri->orig_url);
+ iri->orig_url = NULL;
+
+ input_file = url_file;
}
else
input_file = (char *) file;
- url_list = (html ? get_urls_html (input_file, NULL, NULL)
+ url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
: get_urls_file (input_file));
+ xfree_null (url_file);
+
for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
{
char *filename = NULL, *new_file = NULL;
int dt;
+ struct iri *tmpiri = iri_dup (iri);
+ struct url *parsed_url = NULL;
if (cur_url->ignore_when_downloading)
continue;
status = QUOTEXC;
break;
}
+
+ parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
+
if ((opt.recursive || opt.page_requisites)
&& (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
{
int old_follow_ftp = opt.follow_ftp;
/* Turn opt.follow_ftp on in case of recursive FTP retrieval */
- if (cur_url->url->scheme == SCHEME_FTP)
+ if (cur_url->url->scheme == SCHEME_FTP)
opt.follow_ftp = 1;
-
- status = retrieve_tree (cur_url->url->url);
+
+ status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
+ tmpiri);
opt.follow_ftp = old_follow_ftp;
}
else
- status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
+ status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
+ cur_url->url->url, &filename,
+ &new_file, NULL, &dt, opt.recursive, tmpiri,
+ true);
+
+ if (parsed_url)
+ url_free (parsed_url);
if (filename && opt.delete_after && file_exists_p (filename))
{
xfree_null (new_file);
xfree_null (filename);
+ iri_free (tmpiri);
}
/* Free the linked list of URL-s. */
free_urlpos (url_list);
+ iri_free (iri);
+
return status;
}
void
rotate_backups(const char *fname)
{
- int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
+#ifdef __VMS
+# define SEP "_"
+# define AVS ";*" /* All-version suffix. */
+# define AVSL (sizeof (AVS) - 1)
+#else
+# define SEP "."
+# define AVSL 0
+#endif
+
+ int maxlen = strlen (fname) + sizeof (SEP) + numdigit (opt.backups) + AVSL;
char *from = (char *)alloca (maxlen);
char *to = (char *)alloca (maxlen);
struct_stat sb;
for (i = opt.backups; i > 1; i--)
{
- sprintf (from, "%s.%d", fname, i - 1);
- sprintf (to, "%s.%d", fname, i);
+#ifdef VMS
+ /* Delete (all versions of) any existing max-suffix file, to avoid
+ * creating multiple versions of it. (On VMS, rename() will
+ * create a new version of an existing destination file, not
+ * destroy/overwrite it.)
+ */
+ if (i == opt.backups)
+ {
+ sprintf (to, "%s%s%d%s", fname, SEP, i, AVS);
+ delete (to);
+ }
+#endif
+ sprintf (to, "%s%s%d", fname, SEP, i);
+ sprintf (from, "%s%s%d", fname, SEP, i - 1);
rename (from, to);
}
- sprintf (to, "%s.%d", fname, 1);
+ sprintf (to, "%s%s%d", fname, SEP, 1);
rename(fname, to);
}
/* Returns true if URL would be downloaded through a proxy. */
+/* U is the already-parsed URL. A NULL U yields false. U is only
+ passed to getproxy () and is not freed here. */
bool
-url_uses_proxy (const char *url)
+url_uses_proxy (struct url * u)
{
bool ret;
- struct url *u = url_parse (url, NULL);
if (!u)
return false;
ret = getproxy (u) != NULL;
- url_free (u);
return ret;
}
else
*file = default_file;
}
+
+/* Return true for an input file's own URL, false otherwise.
+
+ This is a one-shot test: it returns true only on the first call
+ for which INPUT_FILE is non-NULL and url_has_scheme (INPUT_FILE)
+ holds. That call clears the static flag FIRST, so every later
+ call returns false regardless of its argument. */
+bool
+input_file_url (const char *input_file)
+{
+ /* Remains true until the first matching call has been seen. */
+ static bool first = true;
+
+ if (input_file
+ && url_has_scheme (input_file)
+ && first)
+ {
+ first = false;
+ return true;
+ }
+ else
+ return false;
+}