[svn] Merge of fix for bugs 20341 and 20410.

[wget] / src / recur.c
diff --git a/src/recur.c b/src/recur.c

index 47f96a10a5b818d096453889507465c39939bbff..53cd39cdc618311ecba790588478d1e3b4963fb7 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -1,11 +1,11 @@
  /* Handling of recursive HTTP retrieving.
-   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996-2006 Free Software Foundation, Inc.
  
  This file is part of GNU Wget.
  
  GNU Wget is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
+the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.
  
  GNU Wget is distributed in the hope that it will be useful,
@@ -14,8 +14,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  You should have received a copy of the GNU General Public License
-along with Wget; if not, write to the Free Software
-Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
  
  In addition, as a special exception, the Free Software Foundation
  gives permission to link the code of its release of Wget with the
@@ -48,6 +47,7 @@ so, delete this exception statement from your version.  */
  #include "hash.h"
  #include "res.h"
  #include "convert.h"
+#include "spider.h"
  \f
  /* Functions for maintaining the URL queue.  */
  
@@ -55,7 +55,7 @@ struct queue_element {
    const char *url;             /* the URL to download */
    const char *referer;         /* the referring document */
    int depth;                   /* the depth */
-  unsigned int html_allowed :1;        /* whether the document is allowed to
+  bool html_allowed;           /* whether the document is allowed to
                                    be treated as HTML. */
  
    struct queue_element *next;  /* next element in queue */
@@ -246,11 +246,8 @@ retrieve_tree (const char *start_url)
         {
           int dt = 0;
           char *redirected = NULL;
-         bool oldrec = opt.recursive;
  
-         opt.recursive = false;
-         status = retrieve_url (url, &file, &redirected, referer, &dt);
-         opt.recursive = oldrec;
+         status = retrieve_url (url, &file, &redirected, referer, &dt, false);
  
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@ -277,6 +274,11 @@ retrieve_tree (const char *start_url)
             }
         }
  
+      if (opt.spider)
+       {
+          visited_url (url, referer);
+       }
+
        if (descend
           && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
         {
@@ -348,22 +350,27 @@ retrieve_tree (const char *start_url)
             }
         }
  
-      if (opt.delete_after || (file && !acceptable (file)))
+      if (file 
+          && (opt.delete_after 
+              || opt.spider /* opt.recursive is implicitly true */
+              || !acceptable (file)))
         {
           /* Either --delete-after was specified, or we loaded this
-            otherwise rejected (e.g. by -R) HTML file just so we
-            could harvest its hyperlinks -- in either case, delete
-            the local file. */
+            (otherwise unneeded because of --spider or rejected by -R) 
+            HTML file just to harvest its hyperlinks -- in either case, 
+            delete the local file. */
           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                    opt.delete_after ? "--delete-after" :
-                  "recursive rejection criteria"));
+                  (opt.spider ? "--spider" : 
+                   "recursive rejection criteria")));
           logprintf (LOG_VERBOSE,
-                    (opt.delete_after
+                    (opt.delete_after || opt.spider
                       ? _("Removing %s.\n")
                       : _("Removing %s since it should be rejected.\n")),
                      file);
           if (unlink (file))
             logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
+         logputs (LOG_VERBOSE, "\n");
           register_delete_file (file);
         }
  
@@ -419,6 +426,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
  
    if (string_set_contains (blacklist, url))
      {
+      if (opt.spider) 
+       {
+          char *referrer = url_string (parent, true);
+          DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url));
+          visited_url (url, referrer);
+         xfree (referrer);
+       }
        DEBUGP (("Already on the black list.\n"));
        goto out;
      }
@@ -482,7 +496,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
        && u->port == start_url_parsed->port
        && !(opt.page_requisites && upos->link_inline_p))
      {
-      if (!frontcmp (start_url_parsed->dir, u->dir))
+      if (!subdir_p (start_url_parsed->dir, u->dir))
         {
           DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                    u->dir, start_url_parsed->dir));
@@ -495,7 +509,7 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth,
       exclusion and inclusion lists.  */
    if (opt.includes || opt.excludes)
      {
-      if (!accdir (u->dir, ALLABS))
+      if (!accdir (u->dir))
         {
           DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
           goto out;