/* HTTP support.
- Copyright (C) 1996-2005 Free Software Foundation, Inc.
+ Copyright (C) 1996-2006 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software Foundation, Inc.,
-51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
# include "gen-md5.h"
#endif
#include "convert.h"
+#include "spider.h"
#ifdef TESTING
#include "test.h"
xfree (resp);
}
+/* Print a single line of response, the characters [b, e). We tried
+ getting away with
+ logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b);
+ but that failed to escape the non-printable characters and, in fact,
+ caused crashes in UTF-8 locales. */
+
+static void
+print_response_line(const char *prefix, const char *b, const char *e)
+{
+ char *copy;
+ BOUNDED_TO_ALLOCA(b, e, copy);
+ logprintf (LOG_VERBOSE, "%s%s\n", prefix, escnonprint(copy));
+}
+
/* Print the server response, line by line, omitting the trailing CRLF
from individual header lines, and prefixed with PREFIX. */
--e;
if (b < e && e[-1] == '\r')
--e;
- /* This is safe even on printfs with broken handling of "%.<n>s"
- because resp->headers ends with \0. */
- logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, e - b, b);
+ print_response_line(prefix, b, e);
}
}
bool
extract_param (const char **source, param_token *name, param_token *value,
- char separator)
+ char separator)
{
const char *p = *source;
if (!*p)
{
*source = p;
- return false; /* no error; nothing more to extract */
+ return false; /* no error; nothing more to extract */
}
/* Extract name. */
while (*p && !ISSPACE (*p) && *p != '=' && *p != separator) ++p;
name->e = p;
if (name->b == name->e)
- return false; /* empty name: error */
+ return false; /* empty name: error */
while (ISSPACE (*p)) ++p;
- if (*p == separator || !*p) /* no value */
+ if (*p == separator || !*p) /* no value */
{
xzero (*value);
if (*p == separator) ++p;
return true;
}
if (*p != '=')
- return false; /* error */
+ return false; /* error */
/* *p is '=', extract value */
++p;
while (ISSPACE (*p)) ++p;
- if (*p == '"') /* quoted */
+ if (*p == '"') /* quoted */
{
value->b = ++p;
while (*p && *p != '"') ++p;
while (ISSPACE (*p)) ++p;
while (*p && *p != separator) ++p;
if (*p == separator)
- ++p;
+ ++p;
else if (*p)
- /* garbage after closed quote, e.g. foo="bar"baz */
- return false;
+ /* garbage after closed quote, e.g. foo="bar"baz */
+ return false;
}
- else /* unquoted */
+ else /* unquoted */
{
value->b = p;
while (*p && *p != separator) ++p;
#undef MAX
#define MAX(p, q) ((p) > (q) ? (p) : (q))
+/* Parse the contents of the `Content-Disposition' header, extracting
+ the information useful to Wget. Content-Disposition is a header
+ borrowed from MIME; when used in HTTP, it typically serves for
+ specifying the desired file name of the resource. For example:
+
+ Content-Disposition: attachment; filename="flora.jpg"
+
+ Wget will skip the tokens it doesn't care about, such as
+ "attachment" in the previous example; it will also skip other
+ unrecognized params. If the header is syntactically correct and
+ contains a file name, a copy of the file name is stored in
+ *filename and true is returned. Otherwise, the function returns
+ false.
+
+ The file name is stripped of directory components and must not be
+ empty. */
+
static bool
parse_content_disposition (const char *hdr, char **filename)
{
while (extract_param (&hdr, &name, &value, ';'))
if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "filename") && value.b != NULL)
{
- /* Make the file name begin at the last slash or backslash. */
+ /* Make the file name begin at the last slash or backslash. */
const char *last_slash = memrchr (value.b, '/', value.e - value.b);
const char *last_bs = memrchr (value.b, '\\', value.e - value.b);
if (last_slash && last_bs)
value.b = 1 + MAX (last_slash, last_bs);
else if (last_slash || last_bs)
value.b = 1 + (last_slash ? last_slash : last_bs);
- if (value.b == value.e)
- continue;
- *filename = strdupdelim (value.b, value.e);
+ if (value.b == value.e)
+ continue;
+ /* Start with the directory prefix, if specified. */
+ if (opt.dir_prefix)
+ {
+ int prefix_length = strlen (opt.dir_prefix);
+ bool add_slash = (opt.dir_prefix[prefix_length - 1] != '/');
+ int total_length;
+
+ if (add_slash)
+ ++prefix_length;
+ total_length = prefix_length + (value.e - value.b);
+ *filename = xmalloc (total_length + 1);
+ strcpy (*filename, opt.dir_prefix);
+ if (add_slash)
+ (*filename)[prefix_length - 1] = '/';
+ memcpy (*filename + prefix_length, value.b, (value.e - value.b));
+ (*filename)[total_length] = '\0';
+ }
+ else
+ *filename = strdupdelim (value.b, value.e);
return true;
}
return false;
/* Determine the local filename if needed. Notice that if -O is used
* hstat.local_file is set by http_loop to the argument of -O. */
- if (!hs->local_file)
+ if (!hs->local_file)
{
/* Honor Content-Disposition whether possible. */
- if (!resp_header_copy (resp, "Content-Disposition", hdrval, sizeof (hdrval))
+ if (!opt.content_disposition
+ || !resp_header_copy (resp, "Content-Disposition",
+ hdrval, sizeof (hdrval))
|| !parse_content_disposition (hdrval, &hs->local_file))
{
- /* Choose filename according to URL name. */
+ /* The Content-Disposition header is missing or broken.
+ * Choose unique file name according to given URL. */
hs->local_file = url_file_name (u);
}
}
/* TODO: perform this check only once. */
- if (opt.noclobber && file_exists_p (hs->local_file))
+ if (file_exists_p (hs->local_file))
{
- /* If opt.noclobber is turned on and file already exists, do not
- retrieve the file */
- logprintf (LOG_VERBOSE, _("\
+ if (opt.noclobber)
+ {
+ /* If opt.noclobber is turned on and file already exists, do not
+ retrieve the file */
+ logprintf (LOG_VERBOSE, _("\
File `%s' already there; not retrieving.\n\n"), hs->local_file);
- /* If the file is there, we suppose it's retrieved OK. */
- *dt |= RETROKF;
+ /* If the file is there, we suppose it's retrieved OK. */
+ *dt |= RETROKF;
- /* #### Bogusness alert. */
- /* If its suffix is "html" or "htm" or similar, assume text/html. */
- if (has_html_suffix_p (hs->local_file))
- *dt |= TEXTHTML;
+ /* #### Bogusness alert. */
+ /* If its suffix is "html" or "htm" or similar, assume text/html. */
+ if (has_html_suffix_p (hs->local_file))
+ *dt |= TEXTHTML;
- return RETROK;
+ return RETROK;
+ }
+ else if (!ALLOW_CLOBBER)
+ {
+ char *unique = unique_name (hs->local_file, true);
+ if (unique != hs->local_file)
+ xfree (hs->local_file);
+ hs->local_file = unique;
+ }
}
/* Support timestamping */
/* Try to stat() the .orig file. */
if (stat (filename_plus_orig_suffix, &st) == 0)
{
- local_dot_orig_file_exists = 1;
+ local_dot_orig_file_exists = true;
local_filename = filename_plus_orig_suffix;
}
}
content-type. */
if (!type ||
0 == strncasecmp (type, TEXTHTML_S, strlen (TEXTHTML_S)) ||
- 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
+ 0 == strncasecmp (type, TEXTXHTML_S, strlen (TEXTXHTML_S)))
*dt |= TEXTHTML;
else
*dt &= ~TEXTHTML;
return RETRFINISHED;
}
- /* Print fetch message, if opt.verbose. */
- if (opt.verbose)
- {
- logprintf (LOG_NOTQUIET, _("Saving to: `%s'\n"),
- HYPHENP (hs->local_file) ? "STDOUT" : hs->local_file);
- }
-
/* Open the local file. */
if (!output_stream)
{
else
fp = output_stream;
+ /* Print fetch message, if opt.verbose. */
+ if (opt.verbose)
+ {
+ logprintf (LOG_NOTQUIET, _("Saving to: `%s'\n"),
+ HYPHENP (hs->local_file) ? "STDOUT" : hs->local_file);
+ }
+
/* This confuses the timestamping code that checks for file size.
#### The timestamping code should be smarter about file size. */
if (opt.save_headers && hs->restval == 0)
int *dt, struct url *proxy)
{
int count;
- bool got_head = false; /* used for time-stamping */
+ bool got_head = false; /* used for time-stamping and filename detection */
+ bool got_name = false;
char *tms;
const char *tmrate;
uerr_t err, ret = TRYLIMEXC;
hstat.referer = referer;
if (opt.output_document)
- hstat.local_file = xstrdup (opt.output_document);
+ {
+ hstat.local_file = xstrdup (opt.output_document);
+ got_name = true;
+ }
/* Reset the counter. */
count = 0;
sleep_between_retrievals (count);
/* Get the current time string. */
- tms = time_str (NULL);
+ tms = time_str (time (NULL));
+ if (opt.spider && !got_head)
+ logprintf (LOG_VERBOSE, _("\
+Spider mode enabled. Check if remote file exists.\n"));
+
/* Print fetch message, if opt.verbose. */
if (opt.verbose)
{
/* Default document type is empty. However, if spider mode is
on or time-stamping is employed, HEAD_ONLY commands is
encoded within *dt. */
- if (opt.spider || (opt.timestamping && !got_head))
+ if (((opt.spider || opt.timestamping) && !got_head) || !got_name)
*dt |= HEAD_ONLY;
else
*dt &= ~HEAD_ONLY;
/* Decide whether or not to restart. */
if (opt.always_rest
+ && got_name
&& stat (hstat.local_file, &st) == 0
&& S_ISREG (st.st_mode))
/* When -c is used, continue from on-disk size. (Can't use
err = gethttp (u, &hstat, dt, proxy);
/* Time? */
- tms = time_str (NULL);
+ tms = time_str (time (NULL));
/* Get the new location (with or without the redirection). */
if (hstat.newloc)
/* All possibilities should have been exhausted. */
abort ();
}
-
+
if (!(*dt & RETROKF))
{
+ char *hurl = NULL;
if (!opt.verbose)
{
/* #### Ugly ugly ugly! */
- char *hurl = url_string (u, true);
+ hurl = url_string (u, true);
logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
- xfree (hurl);
}
- logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
- tms, hstat.statcode, escnonprint (hstat.error));
+ /* Maybe we should always keep track of broken links, not just in
+ * spider mode. */
+ if (opt.spider)
+ {
+ /* #### Again: ugly ugly ugly! */
+ if (!hurl)
+ hurl = url_string (u, true);
+ nonexisting_url (hurl);
+ logprintf (LOG_NOTQUIET, _("\
+Remote file does not exist -- broken link!!!\n"));
+ }
+ else
+ {
+ logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
+ tms, hstat.statcode, escnonprint (hstat.error));
+ }
logputs (LOG_VERBOSE, "\n");
ret = WRONGCODE;
+ xfree_null (hurl);
goto exit;
}
/* Did we get the time-stamp? */
if (!got_head)
{
+ bool restart_loop = false;
+
if (opt.timestamping && !hstat.remote_time)
{
logputs (LOG_NOTQUIET, _("\
logputs (LOG_VERBOSE, _("\
Last-modified header invalid -- time-stamp ignored.\n"));
}
- }
-
- /* The time-stamping section. */
- if (opt.timestamping && !got_head)
- {
- got_head = true; /* no more time-stamping */
- *dt &= ~HEAD_ONLY;
- count = 0; /* the retrieve count for HEAD is reset */
-
- if (hstat.remote_time && tmr != (time_t) (-1))
+
+ /* The time-stamping section. */
+ if (opt.timestamping)
{
- /* Now time-stamping can be used validly. Time-stamping
- means that if the sizes of the local and remote file
- match, and local file is newer than the remote file,
- it will not be retrieved. Otherwise, the normal
- download procedure is resumed. */
- if (hstat.orig_file_tstamp >= tmr)
+ if (hstat.orig_file_name) /* Perform the following checks only
+ if the file we're supposed to
+ download already exists. */
{
- if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen)
+ if (hstat.remote_time &&
+ tmr != (time_t) (-1))
{
- logprintf (LOG_VERBOSE, _("\
+ /* Now time-stamping can be used validly. Time-stamping
+ means that if the sizes of the local and remote file
+ match, and local file is newer than the remote file,
+ it will not be retrieved. Otherwise, the normal
+ download procedure is resumed. */
+ if (hstat.orig_file_tstamp >= tmr)
+ {
+ if (hstat.contlen == -1
+ || hstat.orig_file_size == hstat.contlen)
+ {
+ logprintf (LOG_VERBOSE, _("\
Server file no newer than local file `%s' -- not retrieving.\n\n"),
- hstat.orig_file_name);
- ret = RETROK;
- goto exit;
+ hstat.orig_file_name);
+ ret = RETROK;
+ goto exit;
+ }
+ else
+ {
+ logprintf (LOG_VERBOSE, _("\
+The sizes do not match (local %s) -- retrieving.\n"),
+ number_to_static_string (local_size));
+ }
+ }
+ else
+ logputs (LOG_VERBOSE,
+ _("Remote file is newer, retrieving.\n"));
+
+ logputs (LOG_VERBOSE, "\n");
}
- else
+ }
+
+ /* free_hstat (&hstat); */
+ hstat.timestamp_checked = true;
+ restart_loop = true;
+ }
+
+ if (opt.always_rest)
+ {
+ got_name = true;
+ restart_loop = true;
+ }
+
+ if (opt.spider)
+ {
+ if (opt.recursive)
+ {
+ if (*dt & TEXTHTML)
+ {
+ logputs (LOG_VERBOSE, _("\
+Remote file exists and could contain links to other resources -- retrieving.\n\n"));
+ restart_loop = true;
+ }
+ else
{
logprintf (LOG_VERBOSE, _("\
-The sizes do not match (local %s) -- retrieving.\n"),
- number_to_static_string (local_size));
+Remote file exists but does not contain any link -- not retrieving.\n\n"));
+ ret = RETRUNNEEDED;
+ goto exit;
}
}
else
- logputs (LOG_VERBOSE,
- _("Remote file is newer, retrieving.\n"));
-
- logputs (LOG_VERBOSE, "\n");
+ {
+ logprintf (LOG_VERBOSE, _("\
+Remote file exists but recursion is disabled -- not retrieving.\n\n"));
+ ret = RETRUNNEEDED;
+ goto exit;
+ }
}
-
- /* free_hstat (&hstat); */
- hstat.timestamp_checked = true;
- continue;
+
+ got_head = true; /* no more time-stamping */
+ *dt &= ~HEAD_ONLY;
+ count = 0; /* the retrieve count for HEAD is reset */
+
+ if (restart_loop)
+ continue;
}
-
+
if ((tmr != (time_t) (-1))
- && !opt.spider
&& ((hstat.len == hstat.contlen) ||
((hstat.res == 0) && (hstat.contlen == -1))))
{
}
/* End of time-stamping section. */
- if (opt.spider)
- {
- logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
- escnonprint (hstat.error));
- ret = RETROK;
- goto exit;
- }
-
tmrate = retr_rate (hstat.rd_size, hstat.dltime);
total_download_time += hstat.dltime;
{
int i;
for (i = 0; i < countof (options); i++)
- if (name.e - name.b == strlen (options[i].name)
- && 0 == strncmp (name.b, options[i].name, name.e - name.b))
- {
- *options[i].variable = strdupdelim (value.b, value.e);
- break;
- }
+ if (name.e - name.b == strlen (options[i].name)
+ && 0 == strncmp (name.b, options[i].name, name.e - name.b))
+ {
+ *options[i].variable = strdupdelim (value.b, value.e);
+ break;
+ }
}
if (!realm || !nonce || !user || !passwd || !path || !method)
{
#ifdef TESTING
-char *
+const char *
test_parse_content_disposition()
{
int i;
struct {
char *hdrval;
+ char *opt_dir_prefix;
char *filename;
bool result;
} test_array[] = {
- { "filename=\"file.ext\"", "file.ext", true },
- { "attachment; filename=\"file.ext\"", "file.ext", true },
- { "attachment; filename=\"file.ext\"; dummy", "file.ext", true },
- { "attachment", NULL, false },
+ { "filename=\"file.ext\"", NULL, "file.ext", true },
+ { "filename=\"file.ext\"", "somedir", "somedir/file.ext", true },
+ { "attachment; filename=\"file.ext\"", NULL, "file.ext", true },
+ { "attachment; filename=\"file.ext\"", "somedir", "somedir/file.ext", true },
+ { "attachment; filename=\"file.ext\"; dummy", NULL, "file.ext", true },
+ { "attachment; filename=\"file.ext\"; dummy", "somedir", "somedir/file.ext", true },
+ { "attachment", NULL, NULL, false },
+ { "attachment", "somedir", NULL, false },
};
for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
{
char *filename;
- bool res = parse_content_disposition (test_array[i].hdrval, &filename);
+ bool res;
+
+ opt.dir_prefix = test_array[i].opt_dir_prefix;
+ res = parse_content_disposition (test_array[i].hdrval, &filename);
mu_assert ("test_parse_content_disposition: wrong result",
res == test_array[i].result
&& (res == false
|| 0 == strcmp (test_array[i].filename, filename)));
-
- /* printf ("test %d: %s\n", i, res == false ? "false" : filename); */
}
return NULL;