xfree (qel);
return 1;
}
-
+\f
static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
struct url *, struct hash_table *));
+static int descend_redirect_p PARAMS ((const char *, const char *, int,
+ struct url *, struct hash_table *));
+
/* Retrieve a part of the web beginning with START_URL. This used to
be called "recursive retrieval", because the old function was
/* The queue of URLs we need to load. */
struct url_queue *queue = url_queue_new ();
- /* The URLs we decided we don't want to load. */
+ /* The URLs we do not wish to enqueue, because they are already in
+ the queue or have already been downloaded. */
struct hash_table *blacklist = make_string_hash_table (0);
/* We'll need various components of this, so better get it over with
status = retrieve_url (url, &file, &redirected, NULL, &dt);
opt.recursive = oldrec;
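+ /* The download succeeded and the result is an HTML document, so
+ this URL is a candidate for descending into the links it
+ contains. */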
+ if (file && status == RETROK
+ && (dt & RETROKF) && (dt & TEXTHTML))
+ descend = 1;
+
if (redirected)
{
+ /* We have been redirected, possibly to another host, or
+ different path, or wherever. Check whether we really
+ want to follow it. */
+ if (descend)
+ {
+ if (!descend_redirect_p (redirected, url, depth,
+ start_url_parsed, blacklist))
+ descend = 0;
+ else
+ /* Make sure that the old pre-redirect form gets
+ blacklisted. */
+ string_set_add (blacklist, url);
+ }
+
xfree (url);
url = redirected;
}
- if (file && status == RETROK
- && (dt & RETROKF) && (dt & TEXTHTML))
- descend = 1;
}
if (descend
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
{
- if (opt.page_requisites && depth == opt.reclevel)
- /* When -p is specified, we can do one more partial
- recursion from the "leaf nodes" on the HTML document
- tree. The recursion is partial in that we won't
- traverse any <A> or <AREA> tags, nor any <LINK> tags
- except for <LINK REL="stylesheet">. */
- /* #### This would be the place to implement the TODO
- entry saying that -p should do two more hops on
- framesets. */
- dash_p_leaf_HTML = TRUE;
+ if (opt.page_requisites
+ && (depth == opt.reclevel || depth == opt.reclevel + 1))
+ {
+ /* When -p is specified, we are allowed to exceed the
+ maximum depth, but only for the "inline" links,
+ i.e. those that are needed to display the page.
+ Originally this could exceed the depth at most by
+ one, but we allow one more level so that the leaf
+ pages that contain frames can be loaded
+ correctly. */
+ dash_p_leaf_HTML = TRUE;
+ }
else
{
/* Either -p wasn't specified or it was and we've
- already gone the one extra (pseudo-)level that it
+ already spent the two extra (pseudo-)levels that it
affords us, so we need to bail out. */
DEBUGP (("Not descending further; at depth %d, max. %d.\n",
depth, opt.reclevel));
if (descend)
{
int meta_disallow_follow = 0;
- struct urlpos *children = get_urls_html (file, url, dash_p_leaf_HTML,
- &meta_disallow_follow);
+ struct urlpos *children
+ = get_urls_html (file, url, &meta_disallow_follow);
if (opt.use_robots && meta_disallow_follow)
{
{
if (child->ignore_when_downloading)
continue;
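+ /* In -p "leaf" mode, follow only the inline links, i.e. the
+ page requisites needed to display this document. */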
+ if (dash_p_leaf_HTML && !child->link_inline_p)
+ continue;
if (descend_url_p (child, url_parsed, depth, start_url_parsed,
blacklist))
{
opt.delete_after ? "--delete-after" :
"recursive rejection criteria"));
logprintf (LOG_VERBOSE,
- (opt.delete_after ? _("Removing %s.\n")
+ (opt.delete_after
+ ? _("Removing %s.\n")
: _("Removing %s since it should be rejected.\n")),
file);
if (unlink (file))
/* Based on the context provided by retrieve_tree, decide whether a
URL is to be descended to. This is only ever called from
- retrieve_tree, but is in a separate function for clarity. */
+ retrieve_tree, but is in a separate function for clarity.
+
+ The most expensive checks (such as those for robots) are memoized
+ by storing the rejected URLs in BLACKLIST. This may or may not
+ help; it pays off when those URLs are encountered many times. */
static int
descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
&& !(u->scheme == SCHEME_FTP && opt.follow_ftp))
{
DEBUGP (("Not following non-HTTP schemes.\n"));
- goto blacklist;
+ goto out;
}
/* 2. If it is an absolute link and they are not followed, throw it
if (opt.relative_only && !upos->link_relative_p)
{
DEBUGP (("It doesn't really look like a relative link.\n"));
- goto blacklist;
+ goto out;
}
/* 3. If its domain is not to be accepted/looked-up, chuck it
if (!accept_domain (u))
{
DEBUGP (("The domain was not accepted.\n"));
- goto blacklist;
+ goto out;
}
/* 4. Check for parent directory.
If we descended to a different host or changed the scheme, ignore
- opt.no_parent. Also ignore it for -p leaf retrievals. */
+ opt.no_parent. Also ignore it for documents needed to display
+ the parent page when in -p mode. */
if (opt.no_parent
- && u->scheme == parent->scheme
- && 0 == strcasecmp (u->host, parent->host)
- && u->port == parent->port)
+ && u->scheme == start_url_parsed->scheme
+ && 0 == strcasecmp (u->host, start_url_parsed->host)
+ && u->port == start_url_parsed->port
+ && !(opt.page_requisites && upos->link_inline_p))
{
- if (!frontcmp (parent->dir, u->dir))
+ if (!frontcmp (start_url_parsed->dir, u->dir))
{
- DEBUGP (("Trying to escape the root directory with no_parent in effect.\n"));
- goto blacklist;
+ DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
+ u->dir, start_url_parsed->dir));
+ goto out;
}
}
if (!accdir (u->dir, ALLABS))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
- goto blacklist;
+ goto out;
}
}
/* 6. */
{
- char *suf = NULL;
+ char *suf;
/* Check for acceptance/rejection rules. We ignore these rules
for HTML documents because they might lead to other files which
need to be downloaded. Of course, we don't know which
if (u->file[0] != '\0'
&& ((suf = suffix (url)) == NULL
|| (0 != strcmp (suf, "html") && 0 != strcmp (suf, "htm"))
- || (opt.reclevel == INFINITE_RECURSION && depth >= opt.reclevel)))
+ || (opt.reclevel != INFINITE_RECURSION && depth >= opt.reclevel)))
{
if (!acceptable (u->file))
{
DEBUGP (("%s (%s) does not match acc/rej rules.\n",
url, u->file));
- FREE_MAYBE (suf);
- goto blacklist;
+ goto out;
}
}
- FREE_MAYBE (suf);
}
/* 7. */
{
DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
u->host, parent->host));
- goto blacklist;
+ goto out;
}
/* 8. */
if (!res_match_path (specs, u->path))
{
DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
- goto blacklist;
+ string_set_add (blacklist, url);
+ goto out;
}
}
return 1;
- blacklist:
- string_set_add (blacklist, url);
-
out:
DEBUGP (("Decided NOT to load it.\n"));
return 0;
}
+
+/* This function determines whether we should descend to the children
+ of the URL whose download resulted in a redirection, possibly to
+ another host, etc. It is needed very rarely, and thus it is merely
+ a simple-minded wrapper around descend_url_p. */
+
+static int
+descend_redirect_p (const char *redirected, const char *original, int depth,
+ struct url *start_url_parsed, struct hash_table *blacklist)
+{
+ struct url *orig_parsed, *new_parsed;
+ struct urlpos *upos;
+ int success;
+
+ orig_parsed = url_parse (original, NULL);
+ assert (orig_parsed != NULL);
+
+ new_parsed = url_parse (redirected, NULL);
+ assert (new_parsed != NULL);
+
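+ /* Wrap the redirect target in a zeroed-out urlpos so that
+ descend_url_p can examine it like any other link discovered
+ in the parent document. */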
+ upos = xmalloc (sizeof (struct urlpos));
+ memset (upos, 0, sizeof (*upos));
+ upos->url = new_parsed;
+
+ success = descend_url_p (upos, orig_parsed, depth,
+ start_url_parsed, blacklist);
+
+ url_free (orig_parsed);
+ url_free (new_parsed);
+ xfree (upos);
+
+ if (!success)
+ DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
+
+ return success;
+}
+
\f
/* Register that URL has been successfully downloaded to FILE. */
downloaded_html_files = slist_prepend (downloaded_html_files, file);
}
-/* convert_links() is called from recursive_retrieve() after we're
- done with an HTML file. This call to convert_links is not complete
- because it converts only the downloaded files, and Wget cannot know
- which files will be downloaded afterwards. So, if we have file
- fileone.html with:
-
- <a href="/c/something.gif">
-
- and /c/something.gif was not downloaded because it exceeded the
- recursion depth, the reference will *not* be changed.
-
- However, later we can encounter /c/something.gif from an "upper"
- level HTML (let's call it filetwo.html), and it gets downloaded.
+/* This function is called when the retrieval is done to convert the
+ links that have been downloaded. It has to be called at the end of
+ the retrieval, because only then does Wget know conclusively which
+ URLs have been downloaded, and which not, so it can tell which
+ direction to convert to.
- But now we have a problem because /c/something.gif will be
- correctly transformed in filetwo.html, but not in fileone.html,
- since Wget could not have known that /c/something.gif will be
- downloaded in the future.
+ The "direction" means that the URLs to the files that have been
+ downloaded get converted to the relative URL which will point to
+ that file. And the other URLs get converted to the remote URL on
+ the server.
- This is why Wget must, after the whole retrieval, call
- convert_all_links to go once more through the entire list of
- retrieved HTMLs, and re-convert them.
+ All the downloaded HTMLs are kept in downloaded_html_files, and
+ downloaded URLs in urls_downloaded. All the information is
+ extracted from these two lists. */
- All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
- in urls_downloaded. From these two lists information is
- extracted. */
void
convert_all_links (void)
{
slist *html;
+ struct wget_timer *timer;
+ long msecs;
+ int file_count = 0;
+
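+ /* Time the conversion so that it can be reported when we are
+ done. */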
+ timer = wtimer_new ();
/* Destructively reverse downloaded_html_files to get it in the right order.
recursive_retrieve() used slist_prepend() consistently. */
DEBUGP (("I cannot find the corresponding URL.\n"));
/* Parse the HTML file... */
- urls = get_urls_html (html->string, url, FALSE, NULL);
+ urls = get_urls_html (html->string, url, NULL);
/* We don't respect meta_disallow_follow here because, even if
the file is not followed, we might still want to convert the
cur_url->local_name = NULL;
}
}
+
/* Convert the links in the file. */
convert_links (html->string, urls);
+ ++file_count;
+
/* Free the data. */
free_urlpos (urls);
}
+
+ msecs = wtimer_elapsed (timer);
+ wtimer_delete (timer);
+ logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),
+ file_count, (double)msecs / 1000);
}
/* Cleanup the data structures associated with recursive retrieving