X-Git-Url: http://sjero.net/git/?p=wget;a=blobdiff_plain;f=src%2Frecur.c;h=53cd39cdc618311ecba790588478d1e3b4963fb7;hp=1e277ca37dfa75663a8902926772dc2b5e67e35b;hb=4d7c5e087b2bc82c9f503dff003916d1047903ce;hpb=ea4ffded27decc9f4baf8ab10c09bc4e7b5834f5 diff --git a/src/recur.c b/src/recur.c index 1e277ca3..53cd39cd 100644 --- a/src/recur.c +++ b/src/recur.c @@ -1,11 +1,11 @@ /* Handling of recursive HTTP retrieving. - Copyright (C) 1996-2005 Free Software Foundation, Inc. + Copyright (C) 1996-2006 Free Software Foundation, Inc. This file is part of GNU Wget. GNU Wget is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or +the Free Software Foundation; either version 3 of the License, or (at your option) any later version. GNU Wget is distributed in the hope that it will be useful, @@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License -along with Wget; if not, write to the Free Software Foundation, Inc., -51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +along with Wget. If not, see <http://www.gnu.org/licenses/>. In addition, as a special exception, the Free Software Foundation gives permission to link the code of its release of Wget with the @@ -48,6 +47,7 @@ so, delete this exception statement from your version. */ #include "hash.h" #include "res.h" #include "convert.h" +#include "spider.h" /* Functions for maintaining the URL queue. 
*/ @@ -246,11 +246,8 @@ retrieve_tree (const char *start_url) { int dt = 0; char *redirected = NULL; - bool oldrec = opt.recursive; - opt.recursive = false; - status = retrieve_url (url, &file, &redirected, referer, &dt); - opt.recursive = oldrec; + status = retrieve_url (url, &file, &redirected, referer, &dt, false); if (html_allowed && file && status == RETROK && (dt & RETROKF) && (dt & TEXTHTML)) @@ -277,6 +274,11 @@ retrieve_tree (const char *start_url) } } + if (opt.spider) + { + visited_url (url, referer); + } + if (descend && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) { @@ -348,22 +350,27 @@ retrieve_tree (const char *start_url) } } - if (opt.delete_after || (file && !acceptable (file))) + if (file + && (opt.delete_after + || opt.spider /* opt.recursive is implicitely true */ + || !acceptable (file))) { /* Either --delete-after was specified, or we loaded this - otherwise rejected (e.g. by -R) HTML file just so we - could harvest its hyperlinks -- in either case, delete - the local file. */ + (otherwise unneeded because of --spider or rejected by -R) + HTML file just to harvest its hyperlinks -- in either case, + delete the local file. */ DEBUGP (("Removing file due to %s in recursive_retrieve():\n", opt.delete_after ? "--delete-after" : - "recursive rejection criteria")); + (opt.spider ? "--spider" : + "recursive rejection criteria"))); logprintf (LOG_VERBOSE, - (opt.delete_after + (opt.delete_after || opt.spider ? 
_("Removing %s.\n") : _("Removing %s since it should be rejected.\n")), file); if (unlink (file)) logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); + logputs (LOG_VERBOSE, "\n"); register_delete_file (file); } @@ -419,6 +426,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (string_set_contains (blacklist, url)) { + if (opt.spider) + { + char *referrer = url_string (parent, true); + DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url)); + visited_url (url, referrer); + xfree (referrer); + } DEBUGP (("Already on the black list.\n")); goto out; }