/* Handling of recursive HTTP retrieving.
- Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+ Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
This file is part of Wget.
}
/* The core of recursive retrieving. Endless recursion is avoided by
- having all URL-s stored to a linked list of URL-s, which is checked
+ having all URLs stored to a linked list of URLs, which is checked
before loading any URL. That way no URL can get loaded twice.
The function also supports specification of maximum recursion depth
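/* Illustrative sketch, not part of the Wget sources: the duplicate
   avoidance described above, reduced to its simplest form -- a linked
   list of already-seen URLs that is consulted before every load.  The
   names `seen_url', `url_seen' and `remember_url' are hypothetical;
   Wget's real list type and lookup differ.  */

#include <stdlib.h>
#include <string.h>

struct seen_url
{
  char *url;
  struct seen_url *next;
};

/* Return non-zero if URL is already on the list.  */
static int
url_seen (const struct seen_url *list, const char *url)
{
  for (; list; list = list->next)
    if (!strcmp (list->url, url))
      return 1;
  return 0;
}

/* Prepend URL to the list so that it can never be loaded twice.  */
static struct seen_url *
remember_url (struct seen_url *list, const char *url)
{
  struct seen_url *node = malloc (sizeof *node);
  node->url = strdup (url);
  node->next = list;
  return node;
}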
{
char *constr, *filename, *newloc;
char *canon_this_url = NULL;
- int dt, inl;
+ int dt, inl, dash_p_leaf_HTML = FALSE;
int this_url_ftp;            /* See the explanation below. */
uerr_t err;
struct urlinfo *rurl;
assert (this_url != NULL);
assert (file != NULL);
/* If quota was exceeded earlier, bail out. */
- if (opt.quota && (opt.downloaded > opt.quota))
+ if (downloaded_exceeds_quota ())
return QUOTEXC;
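/* downloaded_exceeds_quota() is defined elsewhere in Wget and is not
   part of this excerpt.  A minimal definition consistent with the
   open-coded test it replaces -- `opt' being the global options
   structure used throughout this file, and a quota of 0 meaning "no
   quota" -- would be the following sketch; the real body may differ.  */

static int
downloaded_exceeds_quota_sketch (void)
{
  if (!opt.quota)
    return 0;                   /* no download quota was set */
  return opt.downloaded > opt.quota;
}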
/* Cache the current URL in the list. */
if (first_time)
else
++depth;
- /* Bail out if opt.reclevel is exceeded. */
- if ((opt.reclevel != 0) && (depth > opt.reclevel))
+ if (opt.reclevel != INFINITE_RECURSION && depth > opt.reclevel)
+ /* We've exceeded the maximum recursion depth specified by the user. */
{
- DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
- depth, opt.reclevel));
- --depth;
- return RECLEVELEXC;
+ if (opt.page_requisites && depth <= opt.reclevel + 1)
+ /* When -p is specified, we can do one more partial recursion from the
+ "leaf nodes" on the HTML document tree. The recursion is partial in
+ that we won't traverse any <A> or <AREA> tags, nor any <LINK> tags
+ except for <LINK REL="stylesheet">. */
+ dash_p_leaf_HTML = TRUE;
+ else
+ /* Either -p wasn't specified or it was and we've already gone the one
+ extra (pseudo-)level that it affords us, so we need to bail out. */
+ {
+ DEBUGP (("Recursion depth %d exceeded max. depth %d.\n",
+ depth, opt.reclevel));
+ --depth;
+ return RECLEVELEXC;
+ }
}
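/* Sketch of how a flag like dash_p_leaf_HTML might be honoured when the
   document is later scanned for links; the real logic belongs to
   get_urls_html() and is not reproduced here.  At a -p "leaf" only page
   requisites are followed: <A> and <AREA> are skipped, and <LINK> passes
   only when REL="stylesheet".  The helper name and its arguments are
   hypothetical.  */

#include <strings.h>            /* strcasecmp */

static int
follow_tag_at_p_leaf (const char *tag, const char *rel)
{
  if (!strcasecmp (tag, "a") || !strcasecmp (tag, "area"))
    return 0;                   /* never traverse ordinary hyperlinks */
  if (!strcasecmp (tag, "link"))
    return rel && !strcasecmp (rel, "stylesheet");
  return 1;                     /* <IMG> and other page requisites */
}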
/* Determine whether this_url is an FTP URL. If it is, it means
this_url_ftp = (urlproto (this_url) == URLFTP);
/* Get the URLs from an HTML file: */
- url_list = get_urls_html (file,
- canon_this_url ? canon_this_url : this_url, 0);
+ url_list = get_urls_html (file, canon_this_url ? canon_this_url : this_url,
+ 0, dash_p_leaf_HTML);
/* Decide what to do with each of the URLs. A URL will be loaded if
it meets several requirements, discussed later. */
for (cur_url = url_list; cur_url; cur_url = cur_url->next)
{
/* If quota was exceeded earlier, bail out. */
- if (opt.quota && (opt.downloaded > opt.quota))
+ if (downloaded_exceeds_quota ())
break;
/* Parse the URL for convenient use in other functions, as well
as to get the optimized form. It also checks URL integrity. */
(!*u->file
|| (((suf = suffix (constr)) != NULL)
&& ((!strcmp (suf, "html") || !strcmp (suf, "htm"))
- && ((opt.reclevel != 0) && (depth != opt.reclevel))))))
+ && ((opt.reclevel != INFINITE_RECURSION) &&
+ (depth != opt.reclevel))))))
{
if (!acceptable (u->file))
{
char *p;
/* Just lowercase the hostname. */
for (p = u->host; *p; p++)
- *p = tolower (*p);
+ *p = TOLOWER (*p);
free (u->url);
u->url = str_url (u, 0);
}
else
DEBUGP (("%s is not text/html so we don't chase.\n",
filename ? filename: "(null)"));
- /* If an suffix-rejected file was loaded only because it was HTML,
- undo the error now */
+
if (opt.delete_after || (filename && !acceptable (filename)))
+ /* Either --delete-after was specified, or we loaded this otherwise
+ rejected (e.g. by -R) HTML file just so we could harvest its
+ hyperlinks -- in either case, delete the local file. */
{
+ DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
+ opt.delete_after ? "--delete-after" :
+ "recursive rejection criteria"));
logprintf (LOG_VERBOSE,
(opt.delete_after ? _("Removing %s.\n")
: _("Removing %s since it should be rejected.\n")),
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
dt &= ~RETROKF;
}
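/* The unlink() call itself falls in lines elided from this excerpt; a
   hedged, stand-alone version of that removal step might look like the
   helper below, with fprintf() standing in for Wget's logprintf().  The
   helper name is hypothetical.  */

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
remove_local_file (const char *filename)
{
  if (unlink (filename))
    fprintf (stderr, "unlink: %s\n", strerror (errno));
}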
+
/* If everything was OK, and links are to be converted, let's
store the local filename. */
if (opt.convert_links && (dt & RETROKF) && (filename != NULL))
freeurl (u, 1);
/* Increment the pbuf for the appropriate size. */
}
- if (opt.convert_links)
+ if (opt.convert_links && !opt.delete_after)
convert_links (file, url_list);
/* Free the linked list of URLs. */
free_urlpos (url_list);
FREE_MAYBE (canon_this_url);
/* Decrement the recursion depth. */
--depth;
- if (opt.quota && (opt.downloaded > opt.quota))
+ if (downloaded_exceeds_quota ())
return QUOTEXC;
else
return RETROK;
This is why Wget must, after the whole retrieval, call
convert_all_links to go once more through the entire list of
- retrieved HTML-s, and re-convert them.
+ retrieved HTMLs, and re-convert them.
All the downloaded HTMLs are kept in urls_html, and downloaded URLs
in urls_downloaded. From these two lists information is
else
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
- urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1);
+ urls = get_urls_html (html->string, urlhtml ? urlhtml->url : NULL, 1,
+ FALSE);
if (!urls)
continue;
for (l1 = urls; l1; l1 = l1->next)
sprintf (version, "Wget/%s", version_string);
}
for (p = version; *p; p++)
- *p = tolower (*p);
+ *p = TOLOWER (*p);
for (p = base_version; *p && *p != '/'; p++)
- *p = tolower (*p);
+ *p = TOLOWER (*p);
*p = '\0';
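/* TOLOWER is Wget's own macro (its definition is not part of this
   excerpt); presumably it is preferred over a bare tolower() because it
   can safely be handed a plain char and does not depend on the locale.
   The stand-alone sketch below uses an ASCII-only stand-in and shows
   what the two loops above compute; "X.Y" stands in for version_string,
   and copying base_version from version is an assumption of the sketch,
   since its initialisation is not shown here.  */

#include <stdio.h>
#include <string.h>

static int
ascii_tolower (int c)
{
  return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c;
}

int
main (void)
{
  char version[64], base_version[64];
  char *p;

  sprintf (version, "Wget/%s", "X.Y");
  strcpy (base_version, version);

  for (p = version; *p; p++)
    *p = ascii_tolower (*p);
  for (p = base_version; *p && *p != '/'; p++)
    *p = ascii_tolower (*p);
  *p = '\0';

  /* Prints "wget/x.y wget": the lowercased full version, and the base
     name later compared against User-agent lines from robots.txt.  */
  printf ("%s %s\n", version, base_version);
  return 0;
}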
/* Setting this to 1 means that Wget considers itself under
while ((line = read_whole_line (fp)))
{
len = strlen (line);
- /* Destroy <CR> if there is one. */
+ /* Destroy <CR><LF> if present. */
+ if (len && line[len - 1] == '\n')
+ line[--len] = '\0';
if (len && line[len - 1] == '\r')
- line[len - 1] = '\0';
+ line[--len] = '\0';
/* According to specifications, optional space may be at the
end... */
DEBUGP (("Line: %s\n", line));
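/* A stand-alone version of the trimming done while reading robots.txt
   lines above: drop a trailing <CR><LF> (or a bare <LF> or <CR>) and,
   per the remark about optional trailing space, any trailing blanks.
   The helper name is hypothetical; the real code does this inline.  */

#include <string.h>

static void
trim_line_ending (char *line)
{
  size_t len = strlen (line);

  while (len && (line[len - 1] == '\n' || line[len - 1] == '\r'))
    line[--len] = '\0';
  while (len && (line[len - 1] == ' ' || line[len - 1] == '\t'))
    line[--len] = '\0';
}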
int match = 0;
/* Lowercase the agent string. */
for (p = str; *p; p++)
- *p = tolower (*p);
+ *p = TOLOWER (*p);
/* If the string is `*', it matches. */
if (*str == '*' && !*(str + 1))
match = 1;
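/* Sketch of the User-agent test begun above.  The rest of the matching
   logic is not part of this excerpt, so the substring rule used here --
   a non-"*" agent string matches when it contains the lowercased "wget"
   base version -- is an assumption; the real code may compare
   differently.  The function name is hypothetical.  */

#include <ctype.h>
#include <string.h>

static int
agent_matches (char *str, const char *base_version)
{
  char *p;

  /* Lowercase the agent string, as above.  */
  for (p = str; *p; p++)
    *p = tolower ((unsigned char) *p);

  /* A lone `*' matches every robot.  */
  if (str[0] == '*' && !str[1])
    return 1;

  /* Otherwise look for the lowercased base version, e.g. "wget".  */
  return strstr (str, base_version) != NULL;
}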