/* Handling of recursive HTTP retrieving.
- Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+ Copyright (C) 1996-2006 Free Software Foundation, Inc.
This file is part of GNU Wget.
GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
GNU Wget is distributed in the hope that it will be useful,
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget. If not, see <http://www.gnu.org/licenses/>.
In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
#include "hash.h"
#include "res.h"
#include "convert.h"
+#include "spider.h"
\f
/* Functions for maintaining the URL queue. */
const char *url; /* the URL to download */
const char *referer; /* the referring document */
int depth; /* the depth */
- unsigned int html_allowed :1; /* whether the document is allowed to
+ bool html_allowed; /* whether the document is allowed to
be treated as HTML. */
struct queue_element *next; /* next element in queue */
{
int dt = 0;
char *redirected = NULL;
- bool oldrec = opt.recursive;
- opt.recursive = false;
- status = retrieve_url (url, &file, &redirected, referer, &dt);
- opt.recursive = oldrec;
+ status = retrieve_url (url, &file, &redirected, referer, &dt, false);
if (html_allowed && file && status == RETROK
&& (dt & RETROKF) && (dt & TEXTHTML))
}
}
+ if (opt.spider)
+ {
+ visited_url (url, referer);
+ }
+
if (descend
&& depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
{
}
}
- if (opt.delete_after || (file && !acceptable (file)))
+ if (file
+ && (opt.delete_after
+ || opt.spider /* opt.recursive is implicitly true */
+ || !acceptable (file)))
{
/* Either --delete-after was specified, or we loaded this
- otherwise rejected (e.g. by -R) HTML file just so we
- could harvest its hyperlinks -- in either case, delete
- the local file. */
+ (otherwise unneeded because of --spider or rejected by -R)
+ HTML file just to harvest its hyperlinks -- in either case,
+ delete the local file. */
DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
opt.delete_after ? "--delete-after" :
- "recursive rejection criteria"));
+ (opt.spider ? "--spider" :
+ "recursive rejection criteria")));
logprintf (LOG_VERBOSE,
- (opt.delete_after
+ (opt.delete_after || opt.spider
? _("Removing %s.\n")
: _("Removing %s since it should be rejected.\n")),
file);
if (unlink (file))
logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+ logputs (LOG_VERBOSE, "\n");
register_delete_file (file);
}
if (string_set_contains (blacklist, url))
{
+ if (opt.spider)
+ {
+ char *referrer = url_string (parent, true);
+ DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+ visited_url (url, referrer);
+ xfree (referrer);
+ }
DEBUGP (("Already on the black list.\n"));
goto out;
}
&& u->port == start_url_parsed->port
&& !(opt.page_requisites && upos->link_inline_p))
{
- if (!frontcmp (start_url_parsed->dir, u->dir))
+ if (!subdir_p (start_url_parsed->dir, u->dir))
{
DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
u->dir, start_url_parsed->dir));
exclusion and inclusion lists. */
if (opt.includes || opt.excludes)
{
- if (!accdir (u->dir, ALLABS))
+ if (!accdir (u->dir))
{
DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
goto out;