From 0dbef4ccb4f359c1bb9ad81120618d8ab436d9b0 Mon Sep 17 00:00:00 2001 From: mtortonesi Date: Thu, 24 Aug 2006 08:27:57 -0700 Subject: [PATCH] [svn] Several fixes for recursive spider mode. --- src/ChangeLog | 24 +++++++ src/Makefile.in | 16 ++--- src/convert.c | 120 --------------------------------- src/convert.h | 3 - src/http.c | 73 ++++++++++++++------ src/recur.c | 13 ++++ src/res.c | 8 ++- src/spider.c | 175 ++++++++++++++++++++++++++++++++++++++++++++++++ src/spider.h | 37 ++++++++++ 9 files changed, 317 insertions(+), 152 deletions(-) create mode 100644 src/spider.c create mode 100644 src/spider.h diff --git a/src/ChangeLog b/src/ChangeLog index f21afddf..4bcb98cc 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,27 @@ +2006-08-24 Mauro Tortonesi + + * Makefile.in: Added spider.c to the list of files to compile and + spider.h to the list of header files. Updated copyright information. + + * http.c: Major changes to recursive spider mode. Now for every + resource we are supposed to check, we send a HEAD request to find out + if it exists. If the resource is a HTML file, we retrieve it and parse + it to discover links to other resources. + + * recur.c: Ditto. + + * res.c (res_retrieve_file): Disable opt.timestamping and opt.spider + when retrieving robots.txt. Updated copyright information. + + * convert.c: Moved code tracking broken links to spider.c. + + * convert.h: Ditto. + + * spider.c: Created new file to keep track of visited URLs in spider + mode. + + * spider.h: Ditto. + 2006-08-21 Mauro Tortonesi * http.c: Fixed timestamping-related bug. diff --git a/src/Makefile.in b/src/Makefile.in index e0313220..bcacd7dd 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -1,5 +1,5 @@ # Makefile for `wget' utility -# Copyright (C) 1995-2005 Free Software Foundation, Inc. +# Copyright (C) 1995-2006 Free Software Foundation, Inc. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -76,8 +76,8 @@ OBJ = $(ALLOCA) cmpt.o connect.o convert.o cookies.o \ ftp.o ftp-basic.o ftp-ls.o $(OPIE_OBJ) $(GETOPT_OBJ) hash.o \ host.o html-parse.o html-url.o http.o $(NTLM_OBJ) init.o \ log.o main.o $(MD5_OBJ) netrc.o progress.o ptimer.o recur.o \ - res.o retr.o safe-ctype.o snprintf.o $(SSL_OBJ) url.o \ - utils.o version.o xmalloc.o + res.o retr.o safe-ctype.o snprintf.o spider.o $(SSL_OBJ) \ + url.o utils.o version.o xmalloc.o .SUFFIXES: .SUFFIXES: .c .o @@ -96,11 +96,11 @@ wget$(exeext): $(OBJ) # time, and it's a lot safer than attempting to get all the # dependencies right. -$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \ - gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \ - http-ntlm.h init.h log.h mswindows.h netrc.h options.h \ - progress.h ptimer.h recur.h res.h retr.h safe-ctype.h ssl.h \ - sysdep.h url.h utils.h wget.h xmalloc.h +$(OBJ): config-post.h config.h connect.h convert.h cookies.h ftp.h \ + gen-md5.h getopt.h gnu-md5.h hash.h host.h html-parse.h \ + http-ntlm.h init.h log.h mswindows.h netrc.h options.h \ + progress.h ptimer.h recur.h res.h retr.h safe-ctype.h \ + spider.h ssl.h sysdep.h url.h utils.h wget.h xmalloc.h # # Dependencies for installing diff --git a/src/convert.c b/src/convert.c index 7def7c89..a59e3c51 100644 --- a/src/convert.c +++ b/src/convert.c @@ -54,8 +54,6 @@ struct hash_table *dl_url_file_map; conversion after Wget is done. 
*/ struct hash_table *downloaded_html_set; -static struct hash_table *nonexisting_urls_hash; - static void convert_links (const char *, struct urlpos *); /* This function is called when the retrieval is done to convert the @@ -835,7 +833,6 @@ register_html (const char *url, const char *file) } static void downloaded_files_free (void); -static void nonexisting_urls_free (void); /* Cleanup the data structures associated with this file. */ @@ -857,7 +854,6 @@ convert_cleanup (void) if (downloaded_html_set) string_set_free (downloaded_html_set); downloaded_files_free (); - nonexisting_urls_free (); if (converted_files) string_set_free (converted_files); } @@ -957,122 +953,6 @@ downloaded_files_free (void) downloaded_files_hash = NULL; } } - -/* Remembers broken links. */ - -struct broken_urls_list -{ - char *url; - struct broken_urls_list *next; -}; - -static bool -in_list (const struct broken_urls_list *list, const char *url) -{ - const struct broken_urls_list *ptr; - - for (ptr = list; ptr; ptr = ptr->next) - { - /* str[case]cmp is inadequate for URL comparison */ - if (are_urls_equal (url, ptr->url) == 0) return true; - } - - return false; -} - -void -nonexisting_url (const char *url, const char *referrer) -{ - struct broken_urls_list *list; - - /* Ignore robots.txt URLs */ - if (is_robots_txt_url (url)) - return; - - if (!nonexisting_urls_hash) - nonexisting_urls_hash = make_string_hash_table (0); - - list = hash_table_get (nonexisting_urls_hash, url); - if (!list) - { - list = (struct broken_urls_list *) xnew0 (struct broken_urls_list); - list->url = referrer ? xstrdup (referrer) : NULL; - hash_table_put (nonexisting_urls_hash, xstrdup (url), list); - } - else if (list && !in_list (list, referrer)) - { - /* Append referrer at the end of the list */ - struct broken_urls_list *newnode; - - while (list->next) list = list->next; - - newnode = xnew0 (struct broken_urls_list); - newnode->url = xstrdup (referrer); - list->next = newnode; - } -} - -static void -nonexisting_urls_free (void) -{ - if (nonexisting_urls_hash) - { - hash_table_iterator iter; - for (hash_table_iterate (nonexisting_urls_hash, &iter); - hash_table_iter_next (&iter); - ) - { - xfree (iter.key); - xfree (iter.value); - } - hash_table_destroy (nonexisting_urls_hash); - nonexisting_urls_hash = NULL; - } -} - -void -print_broken_links (void) -{ - hash_table_iterator iter; - int num_elems; - - if (!nonexisting_urls_hash) - { - logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n")); - return; - } - - num_elems = hash_table_count (nonexisting_urls_hash); - assert (num_elems > 0); - - if (num_elems > 1) - { - logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"), - num_elems); - } - else - { - logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n")); - } - - for (hash_table_iterate (nonexisting_urls_hash, &iter); - hash_table_iter_next (&iter); - ) - { - struct broken_urls_list *list; - - logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key); - - for (list = (struct broken_urls_list *) iter.value; - list; - list = list->next) - { - logprintf (LOG_NOTQUIET, _(" %s\n"), list->url); - } - } - logputs (LOG_NOTQUIET, "\n"); -} - /* The function returns the pointer to the malloc-ed quoted version of string s. 
It will recognize and quote numeric and special graphic diff --git a/src/convert.h b/src/convert.h index 6104b393..11d6a5f1 100644 --- a/src/convert.h +++ b/src/convert.h @@ -104,7 +104,4 @@ void convert_cleanup (void); char *html_quote_string (const char *); -void nonexisting_url (const char *, const char *); -void print_broken_links (void); - #endif /* CONVERT_H */ diff --git a/src/http.c b/src/http.c index 5da467cf..a6fb4d1d 100644 --- a/src/http.c +++ b/src/http.c @@ -2281,6 +2281,10 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, /* Get the current time string. */ tms = time_str (time (NULL)); + if (opt.spider && !got_head) + logprintf (LOG_VERBOSE, _("\ +Spider mode enabled. Check if remote file exists.\n")); + /* Print fetch message, if opt.verbose. */ if (opt.verbose) { @@ -2308,8 +2312,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, /* Default document type is empty. However, if spider mode is on or time-stamping is employed, HEAD_ONLY commands is encoded within *dt. */ - if ((opt.spider && !opt.recursive) - || (opt.timestamping && !got_head) + if (((opt.spider || opt.timestamping) && !got_head) || (opt.always_rest && !got_name)) *dt |= HEAD_ONLY; else @@ -2412,13 +2415,22 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer, hurl = url_string (u, true); logprintf (LOG_NONVERBOSE, "%s:\n", hurl); } - if (opt.spider && opt.recursive) + /* Maybe we should always keep track of broken links, not just in + * spider mode. */ + if (opt.spider) + { + /* #### Again: ugly ugly ugly! */ + if (!hurl) + hurl = url_string (u, true); + nonexisting_url (hurl); + logprintf (LOG_NOTQUIET, _("\ +Remote file does not exist -- broken link!!!\n")); + } + else { - if (!hurl) hurl = url_string (u, true); - nonexisting_url (hurl, referer); + logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), + tms, hstat.statcode, escnonprint (hstat.error)); } - logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), - tms, hstat.statcode, escnonprint (hstat.error)); logputs (LOG_VERBOSE, "\n"); ret = WRONGCODE; xfree_null (hurl); @@ -2447,10 +2459,12 @@ Last-modified header invalid -- time-stamp ignored.\n")); /* The time-stamping section. */ if (opt.timestamping) { - if (hstat.orig_file_name) /* Perform this check only if the file we're - supposed to download already exists. */ + if (hstat.orig_file_name) /* Perform the following checks only + if the file we're supposed to + download already exists. */ { - if (hstat.remote_time && tmr != (time_t) (-1)) + if (hstat.remote_time && + tmr != (time_t) (-1)) { /* Now time-stamping can be used validly. Time-stamping means that if the sizes of the local and remote file @@ -2459,7 +2473,8 @@ Last-modified header invalid -- time-stamp ignored.\n")); download procedure is resumed. 
*/ if (hstat.orig_file_tstamp >= tmr) { - if (hstat.contlen == -1 || hstat.orig_file_size == hstat.contlen) + if (hstat.contlen == -1 + || hstat.orig_file_size == hstat.contlen) { logprintf (LOG_VERBOSE, _("\ Server file no newer than local file `%s' -- not retrieving.\n\n"), @@ -2492,6 +2507,33 @@ The sizes do not match (local %s) -- retrieving.\n"), got_name = true; restart_loop = true; } + + if (opt.spider) + { + if (opt.recursive) + { + if (*dt & TEXTHTML) + { + logputs (LOG_VERBOSE, _("\ +Remote file exists and could contain links to other resources -- retrieving.\n\n")); + restart_loop = true; + } + else + { + logprintf (LOG_VERBOSE, _("\ +Remote file exists but does not contain any link -- not retrieving.\n\n")); + ret = RETRUNNEEDED; + goto exit; + } + } + else + { + logprintf (LOG_VERBOSE, _("\ +Remote file exists but recursion is disabled -- not retrieving.\n\n")); + ret = RETRUNNEEDED; + goto exit; + } + } got_head = true; /* no more time-stamping */ *dt &= ~HEAD_ONLY; @@ -2502,7 +2544,6 @@ The sizes do not match (local %s) -- retrieving.\n"), } if ((tmr != (time_t) (-1)) - && (!opt.spider || opt.recursive) && ((hstat.len == hstat.contlen) || ((hstat.res == 0) && (hstat.contlen == -1)))) { @@ -2521,14 +2562,6 @@ The sizes do not match (local %s) -- retrieving.\n"), } /* End of time-stamping section. */ - if (opt.spider && !opt.recursive) - { - logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode, - escnonprint (hstat.error)); - ret = RETROK; - goto exit; - } - tmrate = retr_rate (hstat.rd_size, hstat.dltime); total_download_time += hstat.dltime; diff --git a/src/recur.c b/src/recur.c index b746332b..33e32bec 100644 --- a/src/recur.c +++ b/src/recur.c @@ -274,6 +274,11 @@ retrieve_tree (const char *start_url) } } + if (opt.spider) + { + visited_url (url, referer); + } + if (descend && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) { @@ -365,6 +370,7 @@ retrieve_tree (const char *start_url) file); if (unlink (file)) logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); + logputs (LOG_VERBOSE, "\n"); register_delete_file (file); } @@ -420,6 +426,13 @@ download_child_p (const struct urlpos *upos, struct url *parent, int depth, if (string_set_contains (blacklist, url)) { + if (opt.spider) + { + char *referrer = url_string (parent, true); + DEBUGP (("download_child_p: parent->url is: `%s'\n", parent->url)); + visited_url (url, referrer); + xfree (referrer); + } DEBUGP (("Already on the black list.\n")); goto out; } diff --git a/src/res.c b/src/res.c index 103bc4e7..a160591f 100644 --- a/src/res.c +++ b/src/res.c @@ -1,5 +1,5 @@ /* Support for Robot Exclusion Standard (RES). - Copyright (C) 2001 Free Software Foundation, Inc. + Copyright (C) 2001,2006 Free Software Foundation, Inc. This file is part of Wget. @@ -539,10 +539,16 @@ res_retrieve_file (const char *url, char **file) { uerr_t err; char *robots_url = uri_merge (url, RES_SPECS_LOCATION); + int saved_ts_val = opt.timestamping; + int saved_sp_val = opt.spider; logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n")); *file = NULL; + opt.timestamping = false; + opt.spider = false; err = retrieve_url (robots_url, file, NULL, NULL, NULL, false); + opt.timestamping = saved_ts_val; + opt.spider = saved_sp_val; xfree (robots_url); if (err != RETROK && *file != NULL) diff --git a/src/spider.c b/src/spider.c new file mode 100644 index 00000000..d8cf8361 --- /dev/null +++ b/src/spider.c @@ -0,0 +1,175 @@ +/* Keep track of visited URLs in spider mode. + Copyright (C) 2006 Free Software Foundation, Inc. 
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+
+In addition, as a special exception, the Free Software Foundation
+gives permission to link the code of its release of Wget with the
+OpenSSL project's "OpenSSL" library (or with modified versions of it
+that use the same license as the "OpenSSL" library), and distribute
+the linked executables. You must obey the GNU General Public License
+in all respects for all of the code used other than "OpenSSL". If you
+modify this file, you may extend this exception to your version of the
+file, but you are not obligated to do so. If you do not wish to do
+so, delete this exception statement from your version. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "wget.h"
+#include "spider.h"
+#include "url.h"
+#include "utils.h"
+#include "hash.h"
+#include "res.h"
+
+
+static struct hash_table *visited_urls_hash;
+static struct hash_table *nonexisting_urls_set;
+
+/* Cleanup the data structures associated with this file. */
+
+void
+spider_cleanup (void)
+{
+  if (visited_urls_hash)
+    {
+      free_keys_and_values (visited_urls_hash);
+      hash_table_destroy (visited_urls_hash);
+      visited_urls_hash = NULL;
+    }
+  if (nonexisting_urls_set)
+    string_set_free (nonexisting_urls_set);
+}
+
+/* Remembers visited files. */
+
+struct url_list
+{
+  char *url;
+  struct url_list *next;
+};
+
+static bool
+in_url_list_p (const struct url_list *list, const char *url, bool verbose)
+{
+  const struct url_list *ptr;
+
+  for (ptr = list; ptr; ptr = ptr->next)
+    {
+      /* str[case]cmp is inadequate for URL comparison */
+      if (are_urls_equal (url, ptr->url))
+        return true;
+    }
+
+  return false;
+}
+
+void
+visited_url (const char *url, const char *referrer)
+{
+  struct url_list *list;
+
+  /* Ignore robots.txt URLs */
+  if (is_robots_txt_url (url))
+    return;
+
+  if (!visited_urls_hash)
+    visited_urls_hash = make_string_hash_table (0);
+
+  list = hash_table_get (visited_urls_hash, url);
+  if (!list)
+    {
+      list = (struct url_list *) xnew0 (struct url_list);
+      list->url = referrer ? xstrdup (referrer) : NULL;
+      hash_table_put (visited_urls_hash, xstrdup (url), list);
+    }
+  else if (referrer && !in_url_list_p (list, referrer, false))
+    {
+      /* Append referrer at the end of the list */
+      struct url_list *newnode;
+
+      while (list->next)
+        list = list->next;
+
+      newnode = (struct url_list *) xnew0 (struct url_list);
+      newnode->url = xstrdup (referrer);
+      list->next = newnode;
+    }
+}
+
+/* Remembers broken links.
*/ +void +nonexisting_url (const char *url) +{ + /* Ignore robots.txt URLs */ + if (is_robots_txt_url (url)) + return; + if (!nonexisting_urls_set) + nonexisting_urls_set = make_string_hash_table (0); + string_set_add (nonexisting_urls_set, url); +} + +void +print_broken_links (void) +{ + hash_table_iterator iter; + int num_elems; + + if (!nonexisting_urls_set) + { + logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n")); + return; + } + + num_elems = hash_table_count (nonexisting_urls_set); + assert (num_elems > 0); + + if (num_elems > 1) + { + logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"), + num_elems); + } + else + { + logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n")); + } + + for (hash_table_iterate (nonexisting_urls_set, &iter); + hash_table_iter_next (&iter); ) + { + struct url_list *list; + const char *url = (const char *) iter.key; + + logprintf (LOG_NOTQUIET, _("%s referred by:\n"), url); + + for (list = (struct url_list *) hash_table_get (visited_urls_hash, url); + list; list = list->next) + { + logprintf (LOG_NOTQUIET, _(" %s\n"), list->url); + } + } + logputs (LOG_NOTQUIET, "\n"); +} + +/* + * vim: et ts=2 sw=2 + */ + diff --git a/src/spider.h b/src/spider.h new file mode 100644 index 00000000..9cf71e80 --- /dev/null +++ b/src/spider.h @@ -0,0 +1,37 @@ +/* Declarations for spider.c + Copyright (C) 2006 Free Software Foundation, Inc. + +This file is part of GNU Wget. + +GNU Wget is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + +GNU Wget is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Wget; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + +In addition, as a special exception, the Free Software Foundation +gives permission to link the code of its release of Wget with the +OpenSSL project's "OpenSSL" library (or with modified versions of it +that use the same license as the "OpenSSL" library), and distribute +the linked executables. You must obey the GNU General Public License +in all respects for all of the code used other than "OpenSSL". If you +modify this file, you may extend this exception to your version of the +file, but you are not obligated to do so. If you do not wish to do +so, delete this exception statement from your version. */ + +#ifndef SPIDER_H +#define SPIDER_H + +void visited_url (const char *, const char *); +void nonexisting_url (const char *); +void print_broken_links (void); + +#endif /* SPIDER_H */ -- 2.39.2
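
The ChangeLog entry above summarizes the new flow in http.c: in recursive spider mode every candidate URL is first probed with a HEAD request, and only resources that turn out to be HTML are then fetched with a GET so their links can be parsed; anything else is left alone once its existence has been confirmed. The stand-alone sketch below illustrates that probe-then-fetch pattern. It is not Wget's own code: it uses libcurl purely for illustration, and the URL it probes is hypothetical.

/* Illustrative probe-then-fetch loop (libcurl, not Wget's http.c).
   The URL below is hypothetical.  Build with: cc sketch.c -lcurl */
#include <stdio.h>
#include <string.h>
#include <curl/curl.h>

/* Discard response bodies; only status code and content type matter here. */
static size_t
discard (void *ptr, size_t size, size_t nmemb, void *userdata)
{
  (void) ptr; (void) userdata;
  return size * nmemb;
}

int
main (void)
{
  const char *url = "http://example.org/index.html";  /* hypothetical */
  CURL *h;
  long code = 0;
  char *ctype = NULL;

  curl_global_init (CURL_GLOBAL_DEFAULT);
  h = curl_easy_init ();
  if (!h)
    return 1;

  curl_easy_setopt (h, CURLOPT_URL, url);
  curl_easy_setopt (h, CURLOPT_WRITEFUNCTION, discard);

  /* Step 1: HEAD request -- does the remote file exist at all? */
  curl_easy_setopt (h, CURLOPT_NOBODY, 1L);
  if (curl_easy_perform (h) == CURLE_OK)
    {
      curl_easy_getinfo (h, CURLINFO_RESPONSE_CODE, &code);
      curl_easy_getinfo (h, CURLINFO_CONTENT_TYPE, &ctype);
    }

  if (code == 0 || code >= 400)
    printf ("broken link: %s\n", url);           /* record as nonexisting */
  else if (ctype && strncmp (ctype, "text/html", 9) == 0)
    {
      /* Step 2: only HTML is worth retrieving, to harvest more links.
         A real spider would parse the body; this sketch discards it. */
      curl_easy_setopt (h, CURLOPT_NOBODY, 0L);
      curl_easy_setopt (h, CURLOPT_HTTPGET, 1L);
      curl_easy_perform (h);
    }
  else
    printf ("exists, not HTML -- not retrieving: %s\n", url);

  curl_easy_cleanup (h);
  curl_global_cleanup ();
  return 0;
}

In the patch itself the equivalent decision is made inside http_loop(): HEAD_ONLY is set whenever opt.spider or opt.timestamping is active and no HEAD has been done yet, and the loop is restarted for a full GET only when the HEAD reply carries TEXTHTML and recursion is enabled.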
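The new spider.c keeps two structures: visited_urls_hash maps every URL seen during the crawl to the list of pages that referred to it (filled in by visited_url(), now called from retrieve_tree() and download_child_p()), while nonexisting_urls_set is a plain string set of URLs whose HEAD check failed (filled in by nonexisting_url() from http.c). Referrers must be recorded for all visited URLs, not only broken ones, because a link is not known to be broken until it is probed, which may happen long after its referrer was processed; print_broken_links() joins the two tables at the end of the run. The sketch below shows the same join with plain linked lists standing in for Wget's hash_table API; the URLs are hypothetical and the referrer order is not significant.

/* Minimal stand-alone sketch of the spider.c bookkeeping: every visited
   URL remembers who linked to it, broken URLs go into a separate set, and
   the final report joins the two.  Linked lists stand in for Wget's
   hash_table; all URLs are hypothetical. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

struct ref  { char *url; struct ref *next; };             /* referrer list */
struct node { char *url; struct ref *refs; bool broken; struct node *next; };

static struct node *visited;

static struct node *
lookup (const char *url)
{
  struct node *n;
  for (n = visited; n; n = n->next)
    if (strcmp (n->url, url) == 0)
      return n;
  n = calloc (1, sizeof *n);
  n->url = strdup (url);
  n->next = visited;
  visited = n;
  return n;
}

static void
visited_url (const char *url, const char *referrer)       /* cf. spider.c */
{
  struct node *n = lookup (url);
  if (referrer)
    {
      struct ref *r = calloc (1, sizeof *r);
      r->url = strdup (referrer);
      r->next = n->refs;
      n->refs = r;
    }
}

static void
nonexisting_url (const char *url)                          /* cf. spider.c */
{
  lookup (url)->broken = true;
}

int
main (void)
{
  visited_url ("http://example.org/a.html", NULL);
  visited_url ("http://example.org/missing.html", "http://example.org/a.html");
  nonexisting_url ("http://example.org/missing.html");

  for (struct node *n = visited; n; n = n->next)
    if (n->broken)
      {
        printf ("%s referred by:\n", n->url);
        for (struct ref *r = n->refs; r; r = r->next)
          printf ("  %s\n", r->url);
      }
  return 0;
}

Run on its own, the sketch prints the one broken URL followed by its referrer, which is the same shape of report that print_broken_links() produces at the end of a spider run.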