[svn] Be careful whether we want to descend into results of redirection.

author hniksic <devnull@localhost>

Mon, 26 Nov 2001 01:11:48 +0000 (17:11 -0800)

committer hniksic <devnull@localhost>

Mon, 26 Nov 2001 01:11:48 +0000 (17:11 -0800)
author hniksic <devnull@localhost>
Mon, 26 Nov 2001 01:11:48 +0000 (17:11 -0800)
committer hniksic <devnull@localhost>
Mon, 26 Nov 2001 01:11:48 +0000 (17:11 -0800)
diff --git a/src/ChangeLog b/src/ChangeLog

index c051e0204821c1ae570557392b604c2a9124d605..3a29317286c08b16dbb2ecc8a4dfea53cb1d57c7 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,8 @@
+2001-11-26  Hrvoje Niksic  <hniksic@arsdigita.com>
+
+       * recur.c (descend_redirect_p): New function.
+       (retrieve_tree): Make sure redirections are not blindly followed.
+
  2001-11-04  Alan Eldridge  <alane@geeksrus.net>
  
         * config.h.in: added HAVE_RANDOM.
diff --git a/src/recur.c b/src/recur.c

index 3bcae5239c0aa892ee479249389fb93a5c96373d..8e713832995b63b0af48e200cc12837ba2fd5f95 100644 (file)
--- a/src/recur.c
+++ b/src/recur.c
@@ -152,6 +152,9 @@ url_dequeue (struct url_queue *queue,
  \f
  static int descend_url_p PARAMS ((const struct urlpos *, struct url *, int,
                                   struct url *, struct hash_table *));
+static int descend_redirect_p PARAMS ((const char *, const char *, int,
+                                      struct url *, struct hash_table *));
+
  
  /* Retrieve a part of the web beginning with START_URL.  This used to
     be called "recursive retrieval", because the old function was
@@ -224,14 +227,25 @@ retrieve_tree (const char *start_url)
         status = retrieve_url (url, &file, &redirected, NULL, &dt);
         opt.recursive = oldrec;
  
+       if (file && status == RETROK
+           && (dt & RETROKF) && (dt & TEXTHTML))
+         descend = 1;
+
         if (redirected)
           {
+           /* We have been redirected, possibly to another host, or
+              different path, or wherever.  Check whether we really
+              want to follow it.  */
+           if (descend)
+             {
+               if (!descend_redirect_p (redirected, url, depth,
+                                        start_url_parsed, blacklist))
+                 descend = 0;
+             }
+
             xfree (url);
             url = redirected;
           }
-       if (file && status == RETROK
-           && (dt & RETROKF) && (dt & TEXTHTML))
-         descend = 1;
        }
  
        if (descend
@@ -307,7 +321,8 @@ retrieve_tree (const char *start_url)
                    opt.delete_after ? "--delete-after" :
                    "recursive rejection criteria"));
           logprintf (LOG_VERBOSE,
-                    (opt.delete_after ? _("Removing %s.\n")
+                    (opt.delete_after
+                     ? _("Removing %s.\n")
                       : _("Removing %s since it should be rejected.\n")),
                      file);
           if (unlink (file))
@@ -525,6 +540,43 @@ descend_url_p (const struct urlpos *upos, struct url *parent, int depth,
  
    return 0;
  }
+
+/* Decide whether to descend into a document that was reached through
+   a redirection from ORIGINAL to REDIRECTED.  Both URL strings are
+   parsed and REDIRECTED is tested by descend_url_p with ORIGINAL as
+   its parent; that verdict is returned.  Rarely needed, hence simple.  */
+
+static int
+descend_redirect_p (const char *redirected, const char *original, int depth,
+                   struct url *start_url_parsed, struct hash_table *blacklist)
+{
+  struct url *orig_parsed, *new_parsed;
+  struct urlpos *upos;
+  int success;
+
+  orig_parsed = url_parse (original, NULL);
+  assert (orig_parsed != NULL);   /* ORIGINAL is required to parse.  */
+
+  new_parsed = url_parse (redirected, NULL);
+  assert (new_parsed != NULL);    /* REDIRECTED is required to parse.  */
+
+  upos = xmalloc (sizeof (struct urlpos));
+  memset (upos, 0, sizeof (*upos));   /* Every field except url stays zero.  */
+  upos->url = new_parsed;   /* Present the redirect target as a found link.  */
+
+  success = descend_url_p (upos, orig_parsed, depth,
+                          start_url_parsed, blacklist);
+
+  url_free (orig_parsed);   /* Temporaries created above are released.  */
+  url_free (new_parsed);
+  xfree (upos);
+
+  if (!success)
+    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));
+
+  return success;
+}
+
  \f
  /* Register that URL has been successfully downloaded to FILE. */
  
@@ -572,32 +624,21 @@ register_html (const char *url, const char *file)
    downloaded_html_files = slist_prepend (downloaded_html_files, file);
  }
  
-/* convert_links() is called from recursive_retrieve() after we're
-   done with an HTML file.  This call to convert_links is not complete
-   because it converts only the downloaded files, and Wget cannot know
-   which files will be downloaded afterwards.  So, if we have file
-   fileone.html with:
-
-   <a href="/c/something.gif">
-
-   and /c/something.gif was not downloaded because it exceeded the
-   recursion depth, the reference will *not* be changed.
-
-   However, later we can encounter /c/something.gif from an "upper"
-   level HTML (let's call it filetwo.html), and it gets downloaded.
+/* This function is called when the retrieval is done to convert the
+   links that have been downloaded.  It has to be called at the end of
+   the retrieval, because only then does Wget know conclusively which
+   URLs have been downloaded, and which not, so it can tell which
+   direction to convert to.
  
-   But now we have a problem because /c/something.gif will be
-   correctly transformed in filetwo.html, but not in fileone.html,
-   since Wget could not have known that /c/something.gif will be
-   downloaded in the future.
+   The "direction" means that the URLs to the files that have been
+   downloaded get converted to the relative URL which will point to
+   that file.  And the other URLs get converted to the remote URL on
+   the server.
  
-   This is why Wget must, after the whole retrieval, call
-   convert_all_links to go once more through the entire list of
-   retrieved HTMLs, and re-convert them.
+   All the downloaded HTMLs are kept in downloaded_html_files, and
+   downloaded URLs in urls_downloaded.  All the information is
+   extracted from these two lists.  */
  
-   All the downloaded HTMLs are kept in downloaded_html_files, and downloaded URLs
-   in urls_downloaded.  From these two lists information is
-   extracted.  */
  void
  convert_all_links (void)
  {
author	hniksic <devnull@localhost>
	Mon, 26 Nov 2001 01:11:48 +0000 (17:11 -0800)
committer	hniksic <devnull@localhost>
	Mon, 26 Nov 2001 01:11:48 +0000 (17:11 -0800)
src/ChangeLog		patch \| blob \| history
src/recur.c		patch \| blob \| history