[svn] Fixed recursive spider mode.

author mtortonesi <devnull@localhost>

Thu, 25 May 2006 16:11:29 +0000 (09:11 -0700)

committer mtortonesi <devnull@localhost>

Thu, 25 May 2006 16:11:29 +0000 (09:11 -0700)
author mtortonesi <devnull@localhost>
Thu, 25 May 2006 16:11:29 +0000 (09:11 -0700)
committer mtortonesi <devnull@localhost>
Thu, 25 May 2006 16:11:29 +0000 (09:11 -0700)
diff --git a/src/ChangeLog b/src/ChangeLog

index b8e207a7c59ad2e5dd923513631ad1190404b5bb..9b9ffcf4b4e5c9fb07fa89864856eb43a198420d 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,27 @@
+2006-05-25  Mauro Tortonesi  <mauro@ferrara.linux.it>
+
+       * convert.c: Added mechanisms to keep track of broken links.
+
+       * convert.h: Ditto.
+
+       * wget.h: Reordered and enumerated uerr_t constants.
+
+       * recur.c: Fixes to support recursive spider mode.
+
+       * http.c: Ditto.
+
+       * main.c: Print broken links in case of recursive spider mode.
+
+       * retr.c: Changed interface of retrieve_url.
+
+       * retr.h: Ditto.
+
+       * ftp.c: Changed interface of ftp_loop.
+
+       * ftp.h: Ditto.
+
+       * res.c: Minor change to reflect changes in interface of retrieve_url.
+
  2006-05-18  Lawrence Jones  <lawrence.jones@ugs.com>
  
         * ftp-ls.c (ftp_parse_unix_ls): Correct size parsing, add size
diff --git a/src/convert.c b/src/convert.c

index a460a4b4fb01c9205382fd548307b8979584af6e..9813373347f7e6292d379620c16fc567d785a090 100644 (file)
--- a/src/convert.c
+++ b/src/convert.c
@@ -53,6 +53,8 @@ struct hash_table *dl_url_file_map;
     conversion after Wget is done.  */
  struct hash_table *downloaded_html_set;
  
+static struct hash_table *nonexisting_urls_hash;
+
  static void convert_links (const char *, struct urlpos *);
  
  /* This function is called when the retrieval is done to convert the
@@ -832,6 +834,7 @@ register_html (const char *url, const char *file)
  }
  
  static void downloaded_files_free (void);
+static void nonexisting_urls_free (void);
  
  /* Cleanup the data structures associated with this file.  */
  
@@ -853,6 +856,7 @@ convert_cleanup (void)
    if (downloaded_html_set)
      string_set_free (downloaded_html_set);
    downloaded_files_free ();
+  nonexisting_urls_free ();
    if (converted_files)
      string_set_free (converted_files);
  }
@@ -952,6 +956,118 @@ downloaded_files_free (void)
        downloaded_files_hash = NULL;
      }
  }
+\f
+/* Remembers broken links.  */
+
+struct broken_urls_list 
+{
+  char *url;                      /* referrer URL; NULL if none was given */
+  struct broken_urls_list *next;  /* next node in the list */
+};
+
+/* Return true if URL matches (case-insensitively) the url of some node
+   in LIST.  A NULL URL, or a node whose url is NULL, never matches.  */
+static bool
+in_list (const struct broken_urls_list *list, const char *url)
+{
+  const struct broken_urls_list *ptr;
+  for (ptr = list; ptr; ptr = ptr->next)
+    {
+      /* TODO: strcasecmp may not be appropriate to compare URLs */
+      if (url && ptr->url && strcasecmp (url, ptr->url) == 0) return true;
+    }
+  return false;
+}
+
+/* Record URL as broken and REFERRER as a page linking to it.  REFERRER
+   may be NULL when unknown; each referrer is stored once per URL.  */
+void
+nonexisting_url (const char *url, const char *referrer)
+{
+  struct broken_urls_list *list;
+
+  if (!nonexisting_urls_hash)
+    nonexisting_urls_hash = make_string_hash_table (0);
+  list = hash_table_get (nonexisting_urls_hash, url);
+  if (!list)
+    {
+      list = xnew0 (struct broken_urls_list);
+      list->url = referrer ? xstrdup (referrer) : NULL;
+      hash_table_put (nonexisting_urls_hash, xstrdup (url), list);
+    }
+  /* A NULL referrer adds nothing to an existing entry and must reach
+     neither in_list (strcasecmp) nor xstrdup.  */
+  else if (referrer && !in_list (list, referrer))
+    {
+      /* Append referrer at the end of the list */
+      struct broken_urls_list *newnode = xnew0 (struct broken_urls_list);
+      while (list->next) list = list->next;
+      newnode->url = xstrdup (referrer);
+      list->next = newnode;
+    }
+}
+
+/* Free the broken-links table: keys, list nodes and referrer strings.  */
+static void
+nonexisting_urls_free (void)
+{
+  hash_table_iterator iter;
+  if (!nonexisting_urls_hash) return;
+  for (hash_table_iterate (nonexisting_urls_hash, &iter);
+       hash_table_iter_next (&iter); )
+    {
+      struct broken_urls_list *node = iter.value, *next;
+      xfree (iter.key);
+      for (; node; node = next)   /* whole list, not just the head */
+        { next = node->next; xfree_null (node->url); xfree (node); }
+    }
+  hash_table_destroy (nonexisting_urls_hash);
+  nonexisting_urls_hash = NULL;
+}
+
+/* Log the number of broken links recorded by nonexisting_url, then each
+   broken URL followed by the referrers stored for it.  */
+void
+print_broken_links (void)
+{
+  hash_table_iterator iter;
+  int num_elems;
+
+  if (!nonexisting_urls_hash)
+    {
+      logprintf (LOG_NOTQUIET, _("Found no broken links.\n\n"));
+      return;
+    }
+
+  /* The table is only created when the first broken URL is recorded.  */
+  num_elems = hash_table_count (nonexisting_urls_hash);
+  assert (num_elems > 0);
+
+  if (num_elems > 1)
+    {
+      logprintf (LOG_NOTQUIET, _("Found %d broken links.\n\n"),
+                 num_elems);
+    }
+  else
+    {
+      logprintf (LOG_NOTQUIET, _("Found 1 broken link.\n\n"));
+    }
+
+  for (hash_table_iterate (nonexisting_urls_hash, &iter);
+       hash_table_iter_next (&iter); )
+    {
+      const struct broken_urls_list *list;
+
+      logprintf (LOG_NOTQUIET, _("%s referred by:\n"), (const char *)iter.key);
+
+      /* url is NULL when the referrer is unknown; never pass NULL to %s.  */
+      for (list = iter.value; list; list = list->next)
+        logprintf (LOG_NOTQUIET, _("    %s\n"),
+                   list->url ? list->url : _("(unknown referrer)"));
+    }
+  logputs (LOG_NOTQUIET, "\n");
+}
+
  \f
  /* The function returns the pointer to the malloc-ed quoted version of
     string s.  It will recognize and quote numeric and special graphic
diff --git a/src/convert.h b/src/convert.h

index fea808cf386ae49e695fd411393120f01f96e940..d2367885b4d2e0e235a13be86b834de2f40d976d 100644 (file)
--- a/src/convert.h
+++ b/src/convert.h
@@ -104,4 +104,7 @@ void convert_cleanup (void);
  
  char *html_quote_string (const char *);
  
+void nonexisting_url (const char *, const char *);
+void print_broken_links (void);
+
  #endif /* CONVERT_H */
diff --git a/src/ftp.c b/src/ftp.c

index b79b5d1416dbb7f008f6ca4ba4ee9b996cc2bc17..0ecb41883a598b2e8424b45161d99b614c406ffe 100644 (file)
--- a/src/ftp.c
+++ b/src/ftp.c
@@ -1773,7 +1773,7 @@ ftp_retrieve_glob (struct url *u, ccon *con, int action)
     of URL.  Inherently, its capabilities are limited on what can be
     encoded into a URL.  */
  uerr_t
-ftp_loop (struct url *u, int *dt, struct url *proxy)
+ftp_loop (struct url *u, int *dt, struct url *proxy, bool recursive, bool glob)
  {
    ccon con;                    /* FTP connection */
    uerr_t res;
@@ -1791,7 +1791,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
    /* If the file name is empty, the user probably wants a directory
       index.  We'll provide one, properly HTML-ized.  Unless
       opt.htmlify is 0, of course.  :-) */
-  if (!*u->file && !opt.recursive)
+  if (!*u->file && !recursive)
      {
        struct fileinfo *f;
        res = ftp_get_listing (u, &con, &f);
@@ -1832,7 +1832,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
    else
      {
        bool ispattern = false;
-      if (opt.ftp_glob)
+      if (glob)
         {
           /* Treat the URL as a pattern if the file name part of the
              URL path contains wildcards.  (Don't check for u->file
@@ -1843,7 +1843,7 @@ ftp_loop (struct url *u, int *dt, struct url *proxy)
             file_part = u->path;
           ispattern = has_wildcards_p (file_part);
         }
-      if (ispattern || opt.recursive || opt.timestamping)
+      if (ispattern || recursive || opt.timestamping)
         {
           /* ftp_retrieve_glob is a catch-all function that gets called
              if we need globbing, time-stamping or recursion.  Its
diff --git a/src/ftp.h b/src/ftp.h

index eed6bf763c116f1ddc7c7e871c3fc903952789a4..9110d818bbb853930f008835562b65dcb7f2defa 100644 (file)
--- a/src/ftp.h
+++ b/src/ftp.h
@@ -119,7 +119,7 @@ enum wget_ftp_fstatus
  };
  
  struct fileinfo *ftp_parse_ls (const char *, const enum stype);
-uerr_t ftp_loop (struct url *, int *, struct url *);
+uerr_t ftp_loop (struct url *, int *, struct url *, bool, bool);
  
  uerr_t ftp_index (const char *, struct url *, struct fileinfo *);
  
diff --git a/src/http.c b/src/http.c

index a0e04bfbc44c6eeeaa6dae031909d9735b8aef4c..54091bf428f372925d73093023b4e5ce9d9dd1f7 100644 (file)
--- a/src/http.c
+++ b/src/http.c
@@ -2309,7 +2309,7 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
        /* Default document type is empty.  However, if spider mode is
           on or time-stamping is employed, HEAD_ONLY commands is
           encoded within *dt.  */
-      if (opt.spider || (opt.timestamping && !got_head))
+      if ((opt.spider && !opt.recursive) || (opt.timestamping && !got_head))
          *dt |= HEAD_ONLY;
        else
          *dt &= ~HEAD_ONLY;
@@ -2400,20 +2400,26 @@ http_loop (struct url *u, char **newloc, char **local_file, const char *referer,
            /* All possibilities should have been exhausted.  */
            abort ();
          }
-      
+     
        if (!(*dt & RETROKF))
          {
+          char *hurl = NULL;
            if (!opt.verbose)
              {
                /* #### Ugly ugly ugly! */
-              char *hurl = url_string (u, true);
+              hurl = url_string (u, true);
                logprintf (LOG_NONVERBOSE, "%s:\n", hurl);
-              xfree (hurl);
+            }
+          if (opt.spider && opt.recursive)
+            {
+              if (!hurl) hurl = url_string (u, true);
+              nonexisting_url (hurl, referer);
              }
            logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"),
                       tms, hstat.statcode, escnonprint (hstat.error));
            logputs (LOG_VERBOSE, "\n");
            ret = WRONGCODE;
+          xfree_null (hurl);
            goto exit;
          }
  
@@ -2479,7 +2485,7 @@ The sizes do not match (local %s) -- retrieving.\n"),
          }
        
        if ((tmr != (time_t) (-1))
-          && !opt.spider
+          && (!opt.spider || opt.recursive)
            && ((hstat.len == hstat.contlen) ||
                ((hstat.res == 0) && (hstat.contlen == -1))))
          {
@@ -2498,7 +2504,7 @@ The sizes do not match (local %s) -- retrieving.\n"),
          }
        /* End of time-stamping section. */
  
-      if (opt.spider)
+      if (opt.spider && !opt.recursive)
          {
            logprintf (LOG_NOTQUIET, "%d %s\n\n", hstat.statcode,
                       escnonprint (hstat.error));
diff --git a/src/main.c b/src/main.c

index 1f3fb25a2172f90de209b9dc31e6807482611766..9bc979aa6c0eb5174bc9ff4b8bd4008f7fb9060e 100644 (file)
--- a/src/main.c
+++ b/src/main.c
@@ -948,7 +948,7 @@ Can't timestamp and not clobber old files at the same time.\n"));
           && url_scheme (*t) != SCHEME_FTP)
         status = retrieve_tree (*t);
        else
-       status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt);
+       status = retrieve_url (*t, &filename, &redirected_URL, NULL, &dt, opt.recursive);
  
        if (opt.delete_after && file_exists_p(filename))
         {
@@ -971,6 +971,13 @@ Can't timestamp and not clobber old files at the same time.\n"));
         logprintf (LOG_NOTQUIET, _("No URLs found in %s.\n"),
                    opt.input_filename);
      }
+
+  /* Print broken links. */
+  if (opt.recursive && opt.spider)
+    {
+      print_broken_links();
+    }
+  
    /* Print the downloaded sum.  */
    if ((opt.recursive || opt.page_requisites
         || nurl > 1
diff --git a/src/recur.c b/src/recur.c

index 1e277ca37dfa75663a8902926772dc2b5e67e35b..611e36061dd0b4eef974a95192c1b011af3a8152 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -246,11 +246,8 @@ retrieve_tree (const char *start_url)
         {
           int dt = 0;
           char *redirected = NULL;
-         bool oldrec = opt.recursive;
  
-         opt.recursive = false;
-         status = retrieve_url (url, &file, &redirected, referer, &dt);
-         opt.recursive = oldrec;
+         status = retrieve_url (url, &file, &redirected, referer, &dt, false);
  
           if (html_allowed && file && status == RETROK
               && (dt & RETROKF) && (dt & TEXTHTML))
@@ -348,17 +345,21 @@ retrieve_tree (const char *start_url)
             }
         }
  
-      if (opt.delete_after || (file && !acceptable (file)))
+      if (file 
+          && (opt.delete_after 
+              || opt.spider /* opt.recursive is implicitely true */
+              || !acceptable (file)))
         {
           /* Either --delete-after was specified, or we loaded this
-            otherwise rejected (e.g. by -R) HTML file just so we
-            could harvest its hyperlinks -- in either case, delete
-            the local file. */
+            (otherwise unneeded because of --spider or rejected by -R) 
+            HTML file just to harvest its hyperlinks -- in either case, 
+            delete the local file. */
           DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                    opt.delete_after ? "--delete-after" :
-                  "recursive rejection criteria"));
+                  (opt.spider ? "--spider" : 
+                   "recursive rejection criteria")));
           logprintf (LOG_VERBOSE,
-                    (opt.delete_after
+                    (opt.delete_after || opt.spider
                       ? _("Removing %s.\n")
                       : _("Removing %s since it should be rejected.\n")),
                      file);
diff --git a/src/res.c b/src/res.c

index 630d74b5ce7ec4767f19052d0ef59f93385a974c..656f2895ec52e971d6f37cbd62138bc802fef049 100644 (file)
--- a/src/res.c
+++ b/src/res.c
@@ -538,7 +538,7 @@ res_retrieve_file (const char *url, char **file)
  
    logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
    *file = NULL;
-  err = retrieve_url (robots_url, file, NULL, NULL, NULL);
+  err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
    xfree (robots_url);
  
    if (err != RETROK && *file != NULL)
diff --git a/src/retr.c b/src/retr.c

index 14f4ffabecfb15906349ddef9c96601d74b6b409..18a7b3233817e1c6786284b5a1d472fbed2655cf 100644 (file)
--- a/src/retr.c
+++ b/src/retr.c
@@ -602,7 +602,7 @@ static char *getproxy (struct url *);
  
  uerr_t
  retrieve_url (const char *origurl, char **file, char **newloc,
-             const char *refurl, int *dt)
+             const char *refurl, int *dt, bool recursive)
  {
    uerr_t result;
    char *url;
@@ -684,13 +684,12 @@ retrieve_url (const char *origurl, char **file, char **newloc,
        /* If this is a redirection, temporarily turn off opt.ftp_glob
          and opt.recursive, both being undesirable when following
          redirects.  */
-      bool oldrec = opt.recursive, oldglob = opt.ftp_glob;
+      bool oldrec = recursive, glob = opt.ftp_glob;
        if (redirection_count)
-       opt.recursive = opt.ftp_glob = false;
+       oldrec = glob = false;
  
-      result = ftp_loop (u, dt, proxy_url);
-      opt.recursive = oldrec;
-      opt.ftp_glob = oldglob;
+      result = ftp_loop (u, dt, proxy_url, recursive, glob);
+      recursive = oldrec;
  
        /* There is a possibility of having HTTP being redirected to
          FTP.  In these cases we must decide whether the text is HTML
@@ -848,7 +847,7 @@ retrieve_from_file (const char *file, bool html, int *count)
           && cur_url->url->scheme != SCHEME_FTP)
         status = retrieve_tree (cur_url->url->url);
        else
-       status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt);
+       status = retrieve_url (cur_url->url->url, &filename, &new_file, NULL, &dt, opt.recursive);
  
        if (filename && opt.delete_after && file_exists_p (filename))
         {
diff --git a/src/retr.h b/src/retr.h

index a612458786712554ffe964312d9db5a253e9245b..3928cfd5ffe7d4be5930cc7ad8b06d6a0909cd5c 100644 (file)
--- a/src/retr.h
+++ b/src/retr.h
@@ -50,7 +50,7 @@ typedef const char *(*hunk_terminator_t) (const char *, const char *, int);
  char *fd_read_hunk (int, hunk_terminator_t, long, long);
  char *fd_read_line (int);
  
-uerr_t retrieve_url (const char *, char **, char **, const char *, int *);
+uerr_t retrieve_url (const char *, char **, char **, const char *, int *, bool);
  uerr_t retrieve_from_file (const char *, bool, int *);
  
  const char *retr_rate (wgint, double);
diff --git a/src/wget.h b/src/wget.h

index bfcaf8dd44c22ad7d514648d421325344e90577b..d0303e4070de258a71d6c124c3836641d16d2a21 100644 (file)
--- a/src/wget.h
+++ b/src/wget.h
@@ -283,18 +283,23 @@ enum
     simplified.  */
  typedef enum
  {
+  /*  0  */
    NOCONERROR, HOSTERR, CONSOCKERR, CONERROR, CONSSLERR,
-  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR,
-  CONCLOSED, FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
-  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR,
-  FTPREXC, FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR,
-  FOPENERR, FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+  CONIMPOSSIBLE, NEWLOCATION, NOTENOUGHMEM, CONPORTERR, CONCLOSED, 
+  /* 10  */
+  FTPOK, FTPLOGINC, FTPLOGREFUSED, FTPPORTERR, FTPSYSERR,
+  FTPNSFOD, FTPRETROK, FTPUNKNOWNTYPE, FTPRERR, FTPREXC, 
+  /* 20  */
+  FTPSRVERR, FTPRETRINT, FTPRESTFAIL, URLERROR, FOPENERR, 
+  FOPEN_EXCL_ERR, FWRITEERR, HOK, HLEXC, HEOF,
+  /* 30  */
    HERR, RETROK, RECLEVELEXC, FTPACCDENIED, WRONGCODE,
-  FTPINVPASV, FTPNOPASV,
-  CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED, READERR, TRYLIMEXC,
-  URLBADPATTERN, FILEBADFILE, RANGEERR, RETRBADPATTERN,
-  RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, AUTHFAILED,
-  QUOTEXC, WRITEFAILED, SSLINITFAILED
+  FTPINVPASV, FTPNOPASV, CONTNOTSUPPORTED, RETRUNNEEDED, RETRFINISHED, 
+  /* 40  */
+  READERR, TRYLIMEXC, URLBADPATTERN, FILEBADFILE, RANGEERR, 
+  RETRBADPATTERN, RETNOTSUP, ROBOTSOK, NOROBOTS, PROXERR, 
+  /* 50  */
+  AUTHFAILED, QUOTEXC, WRITEFAILED, SSLINITFAILED
  } uerr_t;
  
  #endif /* WGET_H */
author	mtortonesi <devnull@localhost>
	Thu, 25 May 2006 16:11:29 +0000 (09:11 -0700)
committer	mtortonesi <devnull@localhost>
	Thu, 25 May 2006 16:11:29 +0000 (09:11 -0700)
src/ChangeLog		patch \| blob \| history
src/convert.c		patch \| blob \| history
src/convert.h		patch \| blob \| history
src/ftp.c		patch \| blob \| history
src/ftp.h		patch \| blob \| history
src/http.c		patch \| blob \| history
src/main.c		patch \| blob \| history
src/recur.c		patch \| blob \| history
src/res.c		patch \| blob \| history
src/retr.c		patch \| blob \| history
src/retr.h		patch \| blob \| history
src/wget.h		patch \| blob \| history