/* File retrieval.
- Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc.
+ Copyright (C) 1995, 1996, 1997, 1998, 2000 Free Software Foundation, Inc.
This file is part of Wget.
#include "ftp.h"
#include "host.h"
#include "connect.h"
+#include "hash.h"
+#ifdef WINDOWS
+LARGE_INTEGER internal_time;
+#else
/* Internal variables used by the timer. */
static long internal_secs, internal_msecs;
+#endif
void logflush PARAMS ((void));
static int show_progress PARAMS ((long, long, enum spflags));
+/* Smaller of two values.  NOTE(review): function-like macro evaluates
+   both arguments twice — callers must avoid side effects in I and J. */
+#define MIN(i, j) ((i) <= (j) ? (i) : (j))
+
/* Reads the contents of file descriptor FD, until it is closed, or a
read error occurs. The data is read in 8K chunks, and stored to
stream fp, which should have been open for writing. If BUF is
from fd immediately, flush or discard the buffer. */
int
get_contents (int fd, FILE *fp, long *len, long restval, long expected,
- struct rbuf *rbuf)
+ struct rbuf *rbuf, int use_expected)
{
- int res;
+ int res = 0;
+ /* res starts at 0 so the post-loop checks below are well-defined even
+ when the loop body never runs (use_expected with *len >= expected). */
static char c[8192];
*len = restval;
*len += res;
}
}
- /* Read from fd while there is available data. */
- do
+ /* Read from fd while there is available data.
+
+ Normally, if expected is 0, it means that it is not known how
+ much data is expected. However, if use_expected is specified,
+ then expected being zero means exactly that. */
+ while (!use_expected || (*len < expected))
{
- res = iread (fd, c, sizeof (c));
+ /* When USE_EXPECTED is on, never request more than the bytes
+ remaining until EXPECTED (MIN's args are side-effect free here). */
+ int amount_to_read = (use_expected
+ ? MIN (expected - *len, sizeof (c))
+ : sizeof (c));
+ res = iread (fd, c, amount_to_read);
if (res > 0)
{
if (fwrite (c, sizeof (char), res, fp) < res)
}
*len += res;
}
- } while (res > 0);
+ else
+ break;
+ }
if (res < -1)
res = -1;
if (opt.verbose)
void
reset_timer (void)
{
-#ifdef HAVE_GETTIMEOFDAY
+#ifndef WINDOWS
+ /* Under Unix, the preferred way to measure the passage of time is
+ through gettimeofday() because of its granularity. However, on
+ some old or weird systems, gettimeofday() might not be available.
+ There we use the simple time(). */
+# ifdef HAVE_GETTIMEOFDAY
struct timeval t;
gettimeofday (&t, NULL);
internal_secs = t.tv_sec;
internal_msecs = t.tv_usec / 1000;
-#else
+# else /* not HAVE_GETTIMEOFDAY */
internal_secs = time (NULL);
internal_msecs = 0;
-#endif
+# endif /* not HAVE_GETTIMEOFDAY */
+#else /* WINDOWS */
+ /* Under Windows, use Windows-specific APIs. */
+ FILETIME ft;
+ SYSTEMTIME st;
+ GetSystemTime(&st);
+ SystemTimeToFileTime(&st,&ft);
+ /* FILETIME is a 64-bit count of 100-nanosecond intervals (since
+ 1601-01-01 UTC); stash the whole value so elapsed_time() can
+ subtract it later. */
+ internal_time.HighPart = ft.dwHighDateTime;
+ internal_time.LowPart = ft.dwLowDateTime;
+#endif /* WINDOWS */
}
/* Return the time elapsed from the last call to reset_timer(), in
long
elapsed_time (void)
{
-#ifdef HAVE_GETTIMEOFDAY
+#ifndef WINDOWS
+# ifdef HAVE_GETTIMEOFDAY
struct timeval t;
gettimeofday (&t, NULL);
return ((t.tv_sec - internal_secs) * 1000
+ (t.tv_usec / 1000 - internal_msecs));
-#else
+# else /* not HAVE_GETTIMEOFDAY */
return 1000 * ((long)time (NULL) - internal_secs);
-#endif
+# endif /* not HAVE_GETTIMEOFDAY */
+#else /* WINDOWS */
+ FILETIME ft;
+ SYSTEMTIME st;
+ LARGE_INTEGER li;
+ GetSystemTime(&st);
+ SystemTimeToFileTime(&st,&ft);
+ li.HighPart = ft.dwHighDateTime;
+ li.LowPart = ft.dwLowDateTime;
+ /* The delta is in 100-ns FILETIME units; dividing by 1e4 (through
+ double, then truncating to long) yields milliseconds, matching the
+ Unix branch above. */
+ return (long) ((li.QuadPart - internal_time.QuadPart) / 1e4);
+#endif /* WINDOWS */
}
/* Print out the appropriate download rate. Appropriate means that if
{
uerr_t result;
char *url;
- int location_changed, already_redirected, dummy;
+ int location_changed, dummy;
int local_use_proxy;
char *mynewloc, *proxy;
struct urlinfo *u;
-
+ /* Set of URLs already seen in this redirection chain; allocated
+ lazily on the first redirect and consulted below to detect cycles
+ (replaces the old single-flag already_redirected scheme). */
+ struct hash_table *redirections = NULL;
/* If dt is NULL, just ignore it. */
if (!dt)
*newloc = NULL;
if (file)
*file = NULL;
- already_redirected = 0;
- again:
u = newurl ();
- /* Parse the URL. RFC2068 requires `Location' to contain an
- absoluteURI, but many sites break this requirement. #### We
- should be liberal and accept a relative location, too. */
- result = parseurl (url, u, already_redirected);
+ /* Parse the URL. */
+ result = parseurl (url, u, 0);
if (result != URLOK)
{
- freeurl (u, 1);
logprintf (LOG_NOTQUIET, "%s: %s.\n", url, uerrmsg (result));
+ freeurl (u, 1);
+ if (redirections)
+ string_set_free (redirections);
+ xfree (url);
return result;
}
+ /* Jumped back to from below each time a `Location' redirect hands us
+ a new URL; at that point U already describes the new location. */
+ redirected:
+
/* Set the referer. */
if (refurl)
u->referer = xstrdup (refurl);
{
logputs (LOG_NOTQUIET, _("Could not find proxy host.\n"));
freeurl (u, 1);
+ if (redirections)
+ string_set_free (redirections);
+ xfree (url);
return PROXERR;
}
/* Parse the proxy URL. */
else
logprintf (LOG_NOTQUIET, _("Proxy %s: Must be HTTP.\n"), proxy);
freeurl (u, 1);
+ if (redirections)
+ string_set_free (redirections);
+ xfree (url);
return PROXERR;
}
u->proto = URLHTTP;
retrieval, so we save recursion to oldrec, and restore it
later. */
int oldrec = opt.recursive;
- if (already_redirected)
+ if (redirections)
opt.recursive = 0;
result = ftp_loop (u, dt);
opt.recursive = oldrec;
#### All of this is, of course, crap. These types should be
determined through mailcap. */
- if (already_redirected && u->local && (u->proto == URLFTP ))
+ if (redirections && u->local && (u->proto == URLFTP ))
{
char *suf = suffix (u->local);
if (suf && (!strcasecmp (suf, "html") || !strcasecmp (suf, "htm")))
location_changed = (result == NEWLOCATION);
if (location_changed)
{
- /* Check for redirection to oneself. */
- if (url_equal (url, mynewloc))
+ char *construced_newloc;
+ uerr_t newloc_result;
+ struct urlinfo *newloc_struct;
+
+ assert (mynewloc != NULL);
+
+ /* The HTTP specs only allow absolute URLs to appear in
+ redirects, but a ton of boneheaded webservers and CGIs out
+ there break the rules and use relative URLs, and popular
+ browsers are lenient about this, so wget should be too. */
+ construced_newloc = url_concat (url, mynewloc);
+ xfree (mynewloc);
+ mynewloc = construced_newloc;
+
+ /* Now, see if this new location makes sense. */
+ newloc_struct = newurl ();
+ newloc_result = parseurl (mynewloc, newloc_struct, 1);
+ if (newloc_result != URLOK)
{
- logprintf (LOG_NOTQUIET, _("%s: Redirection to itself.\n"),
- mynewloc);
- return WRONGCODE;
+ logprintf (LOG_NOTQUIET, "%s: %s.\n", mynewloc, uerrmsg (newloc_result));
+ freeurl (newloc_struct, 1);
+ freeurl (u, 1);
+ if (redirections)
+ string_set_free (redirections);
+ xfree (url);
+ xfree (mynewloc);
+ return result;
}
- if (mynewloc)
+
+ /* Now mynewloc will become newloc_struct->url, because if the
+ Location contained relative paths like .././something, we
+ don't want that propagating as url. */
+ xfree (mynewloc);
+ mynewloc = xstrdup (newloc_struct->url);
+
+ if (!redirections)
{
- /* The HTTP specs only allow absolute URLs to appear in redirects, but
- a ton of boneheaded webservers and CGIs out there break the rules
- and use relative URLs, and popular browsers are lenient about this,
- so wget should be too. */
- if (strstr(mynewloc, "://") == NULL)
- /* Doesn't look like an absolute URL (this check will incorrectly
- think that rare relative URLs containing "://" later in the
- string are absolute). */
- {
- char *temp = malloc(strlen(url) + strlen(mynewloc) + 1);
-
- if (mynewloc[0] == '/')
- /* "Hostless absolute" URL. Convert to absolute. */
- sprintf(temp,"%s%s", url, mynewloc);
- else
- /* Relative URL. Convert to absolute. */
- sprintf(temp,"%s/%s", url, mynewloc);
-
- free(mynewloc);
- mynewloc = temp;
- }
-
- free (url);
- url = mynewloc;
+ redirections = make_string_hash_table (0);
+ /* Add current URL immediately so we can detect it as soon
+ as possible in case of a cycle. */
+ string_set_add (redirections, u->url);
+ }
+
+ /* The new location is OK. Let's check for redirection cycle by
+ peeking through the history of redirections. */
+ if (string_set_exists (redirections, newloc_struct->url))
+ {
+ logprintf (LOG_NOTQUIET, _("%s: Redirection cycle detected.\n"),
+ mynewloc);
+ freeurl (newloc_struct, 1);
+ freeurl (u, 1);
+ if (redirections)
+ string_set_free (redirections);
+ xfree (url);
+ xfree (mynewloc);
+ return WRONGCODE;
}
+ string_set_add (redirections, newloc_struct->url);
+
+ xfree (url);
+ url = mynewloc;
freeurl (u, 1);
- already_redirected = 1;
- goto again;
+ u = newloc_struct;
+ goto redirected;
}
+
if (file)
{
if (u->local)
*file = NULL;
}
freeurl (u, 1);
+ if (redirections)
+ string_set_free (redirections);
if (newloc)
*newloc = url;
else
- free (url);
+ xfree (url);
return result;
}
uerr_t status;
urlpos *url_list, *cur_url;
- /* If spider-mode is on, we do not want get_urls_html barfing
- errors on baseless links. */
- url_list = (html ? get_urls_html (file, NULL, opt.spider, FALSE)
+ url_list = (html ? get_urls_html (file, NULL, FALSE, NULL)
+ /* NOTE(review): get_urls_html's argument list changed — third arg is
+ now a constant FALSE (spider flag dropped), fourth is NULL; confirm
+ against the new prototype in the headers. */
: get_urls_file (file));
status = RETROK; /* Suppose everything is OK. */
*count = 0; /* Reset the URL count. */
char *filename, *new_file;
int dt;
- if (opt.quota && opt.downloaded > opt.quota)
+ /* downloaded_exceeds_quota() also copes with byte-counter overflow,
+ unlike the raw comparison it replaces. */
+ if (downloaded_exceeds_quota ())
{
status = QUOTEXC;
break;
{
logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
}
+
+/* Increment opt.downloaded by BY_HOW_MUCH. If an overflow occurs,
+ set opt.downloaded_overflow to 1. */
+void
+downloaded_increase (unsigned long by_how_much)
+{
+ VERY_LONG_TYPE old;
+ if (opt.downloaded_overflow)
+ /* Counter already saturated; further counting is meaningless. */
+ return;
+ old = opt.downloaded;
+ opt.downloaded += by_how_much;
+ if (opt.downloaded < old) /* carry flag, where are you when I
+ need you? */
+ /* NOTE(review): this wrap test is only valid if VERY_LONG_TYPE is
+ unsigned — signed overflow would be undefined behavior; confirm
+ the typedef. */
+ {
+ /* Overflow. */
+ opt.downloaded_overflow = 1;
+ opt.downloaded = ~((VERY_LONG_TYPE)0);
+ }
+}
+
+/* Return non-zero if the downloaded amount of bytes exceeds the
+ desired quota. If quota is not set or if the amount overflowed, 0
+ is returned. */
+int
+downloaded_exceeds_quota (void)
+{
+ /* A quota of zero means "no quota set". */
+ if (!opt.quota)
+ return 0;
+ if (opt.downloaded_overflow)
+ /* We don't really know. (Wildly) assume not. */
+ return 0;
+
+ return opt.downloaded > opt.quota;
+}